In [1]:
import pandas as pd
import numpy as np
import datasets
import os
import re
from torch.utils.data import Dataset

import nltk.data
from transformers import AutoTokenizer



libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


### Preprocess the selected bAbI text file

In [2]:
def _parse_babi_line(line):
    """Split one raw bAbI line into line number, text, answer and supporting-fact ids.

    Fact lines have the form ``<n> <text>``; question lines have the form
    ``<n> <question><TAB><answer><TAB><ref> <ref> ...``.
    """
    number, _, body = line.partition(' ')
    text, tab, tail = body.partition('\t')
    if not tab:  # plain fact: there is no answer part
        return {'text': text, 'phrase_num': int(number),
                'answer': None, 'reference_num': None}
    answer, _, refs = tail.partition('\t')
    return {'text': text, 'phrase_num': int(number), 'answer': answer,
            'reference_num': [int(n) for n in refs.split()]}


def get_dataset_df(dataset_path):
    """Load a bAbI task file into a DataFrame with one question per sample.

    The file is parsed line by line. A story (a run of lines that starts with
    line number 1) may contain several questions; it is expanded into one
    sample per question, each holding every fact of the story that precedes
    the question, followed by the question itself.

    Parameters
    ----------
    dataset_path : str
        Path to a bAbI text file.

    Returns
    -------
    pandas.DataFrame
        Columns: ``text``, ``phrase_num`` (line number inside the story),
        ``answer`` and ``reference_num`` (both missing on fact rows),
        ``initial_sample_num`` (0-based story id in the file) and
        ``sample_num`` (0-based id of the single-question sample).

    Raises
    ------
    ValueError
        If the file contains no question line at all.
    """
    with open(dataset_path, 'r') as f:
        lines = f.read().strip().split('\n')

    df = pd.DataFrame([_parse_babi_line(line) for line in lines],
                      columns=['text', 'phrase_num', 'answer', 'reference_num'])

    # A new story starts on every line numbered 1.
    df['initial_sample_num'] = (df.phrase_num == 1).cumsum() - 1

    # multiple questions in sample -> samples with single question
    single_question_slices = []
    for _, story in df.groupby('initial_sample_num', sort=False):
        is_fact = story.answer.isna()
        for question_label in story.index[(~is_fact).to_numpy()]:
            # `story` keeps the labels of the full frame, so the cut must be
            # label-based (.loc, end-inclusive). A positional `story[:label + 1]`
            # returns the whole story for every story but the first one.
            context = story.loc[:question_label]
            keep = is_fact.loc[:question_label] | (context.index == question_label)
            single_question_slices.append(context[keep])

    if not single_question_slices:
        raise ValueError(f"no questions found in {dataset_path}")

    df = pd.concat(single_question_slices).reset_index(drop=True)

    # Every slice starts with the first line of its story, so line number 1
    # marks sample boundaries here as well.
    df['sample_num'] = (df.phrase_num == 1).cumsum() - 1

    return df

In [3]:
# Data sources for the notebook.
# dataset = datasets.load_dataset("facebook/babi_qa", "en-10k-qa1") # fails for me
# Local copies of the bAbI qa1 train/test files are referenced instead.
# NOTE(review): absolute, machine-specific paths; adjust them before running elsewhere.
train_path = "/home/bulatov/datasets/babi/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt"
test_path = "/home/bulatov/datasets/babi/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt"
# WikiText-2 (raw) is the source of background "noise" text used further down.
noise_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1")


In [3]:
# Folder holding the bAbI task files (machine-specific absolute path).
data_folder = '/home/bulatov/datasets/babi/tasks_1-20_v1-2/en-10k'
# The first item yielded by os.walk describes the top-level folder; element [2]
# is the list of file names directly inside it.
fns = next(os.walk(data_folder))[2]

In [146]:
# Smoke test: every file found in the bAbI folder must parse without raising.
i = 0  # keeps `i` defined even when `fns` is empty
for i, fn in enumerate(fns):
    dataset_path = os.path.join(data_folder, fn)
    df = get_dataset_df(dataset_path)
    

### Task dataset
Each sample is one task consisting of facts, a question, its answer, and the reference (supporting) facts.

In [4]:
class TaskDataset(Dataset):
    """Map-style dataset that yields one single-question bAbI task per item.

    The underlying frame is produced by ``get_dataset_df`` and kept in
    ``self.fact_dataset``; its ``sample_num`` column holds the 0-based id of
    the task every row belongs to.
    """

    def __init__(self, dataset_path):
        self.fact_dataset = get_dataset_df(dataset_path)

    def __getitem__(self, ind):
        """Return the task with id ``ind`` as a dict.

        Keys: ``facts`` (texts of all rows but the last), ``question`` (text of
        the last row), ``answer`` (answer of the last row) and ``references``
        (texts of the rows whose line number is listed as a supporting fact).

        Raises
        ------
        IndexError
            If no task has the id ``ind``.
        """
        slc = self.fact_dataset[self.fact_dataset.sample_num == ind]
        if slc.empty:
            raise IndexError(f"sample index {ind} is out of range")

        texts = slc.text.values
        # The question is the last row of the slice; its reference_num lists
        # the line numbers of the supporting facts.
        references = slc[slc.phrase_num.isin(slc.reference_num.values[-1])].text.values
        return {'facts': texts[:-1],
                'question': texts[-1],
                'answer': slc.answer.values[-1],
                'references': references}

    def __len__(self):
        """Return the number of tasks (``sample_num`` is 0-based, hence ``max + 1``)."""
        if self.fact_dataset.empty:
            return 0
        return int(self.fact_dataset.sample_num.max()) + 1


In [6]:
# Build the task dataset from the qa3 (three supporting facts) test split and
# fetch one task as a quick check.
# NOTE(review): this path has a different root (/home/jovyan) than the paths
# defined in the earlier cells (/home/bulatov) - confirm which one is current.
dataset_path = "/home/jovyan/rmt/datasets/babi/data/tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_test.txt"
task_dataset = TaskDataset(dataset_path)
sample = task_dataset[10]
# sample

### Noise dataset
The noise dataset supplies background text, sampled as whole sentences.

`sample_size` is measured in tokens.

In [7]:
# Punkt sentence splitter. NOTE(review): SentenceSampler builds its own instance,
# so this module-level one looks unused in the cells shown here - confirm.
sentence_tokenizer = nltk.PunktSentenceTokenizer()
# GPT-2 tokenizer; it is passed to the sampler and dataset classes below.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [8]:
def sum_lengths(sentences):
    """Return the combined length of all sequences in ``sentences``."""
    return sum(map(len, sentences))


class SentenceSampler:
    """Streams tokenized sentences from a text dataset in chunks of bounded size.

    Dataset items are read sequentially (their ``'text'`` field), split into
    sentences and tokenized. Sentences that have been read but not yet handed
    out are buffered in ``self.sentences``. When the end of the dataset is
    reached, reading starts over from the first item.
    """

    def __init__(self, dataset, tokenizer):
        self.sample_ind = 0          # index of the next dataset item to read
        self.dataset = dataset
        self.sentences = []          # buffered tokenized sentences (lists of token ids)
        self.tokenizer = tokenizer
        self.sentence_tokenizer = nltk.PunktSentenceTokenizer()

    def get_sample(self, sample_size):
        """Return consecutive buffered sentences totalling fewer than ``sample_size`` tokens.

        Parameters
        ----------
        sample_size : int
            Token budget. The returned sentences always sum to strictly less
            than this value; the first sentence that would reach the budget
            stays in the buffer for the next call.

        Returns
        -------
        list[list[int]]
            Tokenized sentences, in reading order. Empty when not even the
            first buffered sentence fits into the budget.

        Raises
        ------
        ValueError
            If the dataset is empty or a full pass over it yields no tokens.
        """
        total_tokens = sum_lengths(self.sentences)
        barren_items = 0  # consecutive dataset items that produced no tokens
        while total_tokens < sample_size:  # add a new dataset item
            dataset_size = len(self.dataset)
            if barren_items >= dataset_size:
                # Also covers the empty dataset; prevents an endless loop.
                raise ValueError("noise dataset yields no tokens")
            if self.sample_ind >= dataset_size:
                # Wrap around instead of raising IndexError: an IndexError
                # escaping from a __getitem__ further up silently ends iteration.
                self.sample_ind = 0
            text = self.dataset[self.sample_ind]['text']
            self.sample_ind += 1
            sentences = self.sentence_tokenizer.tokenize(text)
            tokenized = [self.tokenizer.encode(s, add_special_tokens=False) for s in sentences]
            self.sentences += tokenized
            added = sum_lengths(tokenized)
            total_tokens += added
            barren_items = 0 if added else barren_items + 1

        sample = []
        sample_tokens = 0
        for sent in self.sentences:  # add new sentence until sample_size is reached
            sample_tokens += len(sent)
            if sample_tokens >= sample_size:
                break
            sample.append(sent)

        # Drop the handed-out sentences with a single slice rather than
        # re-slicing the buffer once per sentence.
        self.sentences = self.sentences[len(sample):]

        return sample

In [10]:
# Sentence sampler over the train split of the noise dataset, using the GPT-2 tokenizer.
noise_sampler = SentenceSampler(noise_dataset['train'], tokenizer=tokenizer)

class NoiseInjectionDataset(Dataset):
    """Wraps a task dataset and scatters each task's facts inside background text.

    Parameters
    ----------
    task_dataset :
        Indexable dataset whose items are dicts with the keys ``facts``,
        ``question`` and ``answer``.
    noise_sampler :
        Object with a ``get_sample(n_tokens)`` method that returns a list of
        tokenized sentences.
    tokenizer :
        Callable returning a mapping with an ``'input_ids'`` entry.
    sample_size : int
        Token budget from which the length of the tokenized task is subtracted
        to obtain the amount of background text requested from the sampler.
    """

    def __init__(self, task_dataset, noise_sampler, tokenizer, sample_size=100):
        self.task_dataset = task_dataset
        self.noise_sampler = noise_sampler
        self.sample_size = sample_size
        self.tokenizer = tokenizer

    def __getitem__(self, ind):
        """Return task ``ind`` extended with noise.

        On top of the task's own keys the returned dict holds
        ``background_text`` (the sampled sentences), ``input_tokens`` (facts
        mixed into the background sentences, followed by the question) and
        ``target_tokens`` (the tokenized answer). Fact positions are drawn
        from NumPy's global random state.
        """
        # Shallow copy: do not modify the object handed out by the task dataset.
        sample = dict(self.task_dataset[ind])

        facts = list(sample['facts'])
        facts_tok = self.tokenizer(facts)['input_ids'] if facts else []
        question_tok = self.tokenizer(sample['question'])['input_ids']
        answer_tok = self.tokenizer(sample['answer'])['input_ids']

        task_len = sum_lengths(facts_tok) + len(question_tok) + len(answer_tok)
        background_text_len = self.sample_size - task_len
        background_text = self.noise_sampler.get_sample(background_text_len)
        sample['background_text'] = background_text

        # Slot i precedes background sentence i; the extra last slot follows
        # all background text. Sorting the drawn slots keeps the facts in
        # their original relative order.
        n_slots = len(background_text) + 1
        fact_positions = np.random.choice(n_slots, len(facts_tok))
        fact_positions.sort()

        slots = [[] for _ in range(n_slots)]
        for fact, pos in zip(facts_tok, fact_positions):
            slots[pos].append(fact)

        # The question always comes last, after any facts placed in the final slot.
        slots[-1].append(question_tok)

        for i, sentence in enumerate(background_text):
            slots[i].append(sentence)

        sample['input_tokens'] = [tok for slot in slots for piece in slot for tok in piece]
        sample['target_tokens'] = answer_tok

        return sample

    def __len__(self):
        """Return the number of tasks in the wrapped dataset."""
        return len(self.task_dataset)

In [11]:
# Token budget passed to NoiseInjectionDataset as `sample_size`.
sample_size = 512
# Combine the bAbI tasks with the background-text sampler.
noise_injection_dataset = NoiseInjectionDataset(task_dataset=task_dataset,
                                                noise_sampler=noise_sampler,
                                                tokenizer=tokenizer,
                                                sample_size=sample_size)

### Visualize one sample

In [12]:
# Fetch one noise-injected sample and list the fields it carries.
sample = noise_injection_dataset[10]
sample.keys()

dict_keys(['facts', 'question', 'answer', 'references', 'background_text', 'input_tokens', 'target_tokens'])

In [13]:
# Pull the human-readable parts out of the sample.
facts = sample['facts']
question = sample['question']
# The answer is shown as decoded from the target tokens.
answer = tokenizer.decode(sample['target_tokens'])

# One decoded string per background sentence.
background_text = tokenizer.batch_decode(sample['background_text'])

# The full model input: facts mixed into the background text, then the question.
input_tokens = tokenizer.decode(sample['input_tokens'])

print(f"Facts: {' '.join(facts)}")
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"References: {' '.join(sample['references'])}")
print()
print('Background text: ', ' '.join(background_text))
print('Combined input: ', input_tokens)
print(f"Target: {answer}")


Facts: Sandra moved to the bathroom. John picked up the football. John dropped the football. Daniel went to the kitchen. Daniel got the apple. John went to the kitchen. Mary went back to the hallway. Daniel travelled to the office. John travelled to the garden. John journeyed to the kitchen. Mary moved to the kitchen. Daniel moved to the garden. Mary journeyed to the bathroom. Daniel grabbed the milk. Mary went to the hallway. Mary got the football there. Mary dropped the football. Sandra moved to the bedroom. Mary went back to the office. John travelled to the office. Mary went back to the bathroom. John moved to the hallway. Sandra went to the office. Mary journeyed to the kitchen. Sandra travelled to the garden. John went back to the garden. Sandra went back to the office. Mary went to the hallway. Daniel went to the bedroom. Mary picked up the football there. John travelled to the kitchen. Mary moved to the bedroom. Sandra went to the bathroom. Daniel put down the milk. Daniel disc