In [1]:
import sys
import datasets
import numpy as np
from transformers import AutoTokenizer
sys.path.append("..")
from babilong_utils import TaskDataset, SentenceSampler, NoiseInjectionDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ### extract dataset archive
# !unzip ../data/tasks_1-20_v1-2.zip -d ../data/

In [3]:
# Background ("noise") corpus used in this cell: the pg19 dataset.
noise_dataset_name = "pg19"
noise_dataset_split = "validation"
# The second positional parameter of `datasets.load_dataset` is the config
# `name`, not the split, so the split has to be passed by keyword.  Passed
# positionally, all splits were prepared and a DatasetDict came back.
# NOTE: with `split=` the result is a single `Dataset`, not a `DatasetDict`.
noise_dataset = datasets.load_dataset(noise_dataset_name, split=noise_dataset_split)

Downloading data: 100%|██████████████████████| 906k/906k [00:00<00:00, 3.36MB/s]
Downloading data: 100%|██████████████████████| 356k/356k [00:00<00:00, 2.94MB/s]
Downloading data: 100%|██████████████████████| 356k/356k [00:00<00:00, 2.99MB/s]
Downloading data: 100%|██████████████████████| 521k/521k [00:00<00:00, 4.16MB/s]
Downloading data: 100%|██████████████████████| 157k/157k [00:00<00:00, 3.21MB/s]
Downloading data: 100%|██████████████████████| 111k/111k [00:00<00:00, 2.65MB/s]
Downloading data: 100%|████████████████████| 99.7k/99.7k [00:00<00:00, 2.34MB/s]
Downloading data: 100%|████████████████████| 49.9k/49.9k [00:00<00:00, 1.27MB/s]
Downloading data: 100%|████████████████████| 51.4k/51.4k [00:00<00:00, 1.88MB/s]
Downloading data: 100%|████████████████████| 51.4k/51.4k [00:00<00:00, 1.38MB/s]
Downloading data: 100%|████████████████████| 99.2k/99.2k [00:00<00:00, 1.98MB/s]
Downloading data: 100%|██████████████████████| 904k/904k [00:00<00:00, 4.39MB/s]
Downloading data: 100%|█████

In [9]:
# Background ("noise") corpus used in this cell: the pg19 dataset.
noise_dataset_name = "pg19"
noise_dataset_split = "test"
# The second positional parameter of `datasets.load_dataset` is the config
# `name`, not the split, so the split has to be passed by keyword.  Passed
# positionally, all splits were prepared and a DatasetDict came back.
# NOTE: with `split=` the result is a single `Dataset`, not a `DatasetDict`.
noise_dataset = datasets.load_dataset(noise_dataset_name, split=noise_dataset_split)

In [8]:
# Download settings with `resume_download` enabled (presumably so that an
# interrupted download continues instead of restarting -- confirm in the docs).
download_config=datasets.utils.DownloadConfig(resume_download=True)

# NOTE(review): `noise_dataset_split` is passed positionally here; the value
# displayed for this cell is a DatasetDict holding train/validation/test, not
# one split.
datasets.load_dataset(noise_dataset_name, noise_dataset_split, download_config=download_config)


DatasetDict({
    train: Dataset({
        features: ['short_book_title', 'publication_date', 'url', 'text'],
        num_rows: 28602
    })
    validation: Dataset({
        features: ['short_book_title', 'publication_date', 'url', 'text'],
        num_rows: 50
    })
    test: Dataset({
        features: ['short_book_title', 'publication_date', 'url', 'text'],
        num_rows: 100
    })
})

In [10]:
# NOTE(review): "train" is passed positionally as the second argument; the
# load log shows all three splits being generated, and the following cells
# index the result with ["train"], i.e. it is used as a DatasetDict.
pg19_train = datasets.load_dataset("pg19", "train")

Generating train split: 28602 examples [00:34, 818.39 examples/s] 
Generating validation split: 50 examples [00:00, 550.34 examples/s]
Generating test split: 100 examples [00:00, 581.73 examples/s]


In [27]:
# Single pass over the training texts to find the indices of the shortest and
# the longest entry, measured by len() of each text.
shortest = longest = None
mmin, mmax = float("inf"), 0

for i, t in enumerate(pg19_train["train"]["text"]):
    # Strict comparisons: on ties the earliest index is kept.
    if len(t) < mmin:
        shortest, mmin = i, len(t)
    if len(t) > mmax:
        longest, mmax = i, len(t)



In [31]:
# Report the titles of the rows found by the shortest/longest scan.
for label, idx in (("shortest book:", shortest), ("longest book:", longest)):
    print(label, pg19_train["train"][idx]["short_book_title"])

shortest book: Hark! The Herald Angels Sing by Felix Mendelssohn
longest book: Webster's Unabridged Dictionary by Various


In [9]:
!ls ../data/

# Module-level alias for datasets.VerificationMode; no other cell shown in
# this notebook refers to it.
VerificationMode = datasets.VerificationMode

tasks_1-20_v1-2  tasks_1-20_v1-2.zip


In [4]:
# qa1 single-supporting-fact task files (en-10k directory): train / test.
train_path, test_path = (
    "../data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt",
    "../data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt",
)

# Background text source for the samplers; this rebinds `noise_dataset`.
noise_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1")

### Load task datasets

In [5]:
# Wrap the train and test task files in TaskDataset objects (train first).
task_dataset_train, task_dataset_test = (
    TaskDataset(path) for path in (train_path, test_path)
)

In [6]:
# Tokenizer shared by the background-text samplers and the datasets built later.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# One background-text sampler per split of the noise dataset (train first).
noise_sampler_train, noise_sampler_test = (
    SentenceSampler(noise_dataset[split], tokenizer=tokenizer)
    for split in ('train', 'test')
)

In [10]:
sample_size = 64               # max number of tokens in sample

# Build the train and test datasets with identical settings; only the task
# dataset and the noise sampler differ between the two.
dataset_train, dataset_test = (
    NoiseInjectionDataset(task_dataset=task,
                          noise_sampler=sampler,
                          tokenizer=tokenizer,
                          sample_size=sample_size)
    for task, sampler in ((task_dataset_train, noise_sampler_train),
                          (task_dataset_test, noise_sampler_test))
)

In [11]:
# Fetch the first item of the training dataset.
sample = dataset_train[0]


# Cell output: the field names a sample provides.
sample.keys()


dict_keys(['facts', 'question', 'answer', 'references', 'background_text', 'fact_positions', 'input_tokens', 'target_tokens'])

In [12]:
# Lengths of the input and target token lists, and the element type of
# background_text.
print(
    len(sample['input_tokens']),
    len(sample['target_tokens']),
    type(sample['background_text'][0]),
)
# Cell output: total length summed over all background_text entries.
sum(map(len, sample['background_text']))

45 2 <class 'list'>


28

### Visualize one sample

In [13]:
# Decode the token-level fields of `sample` back to text.
answer = tokenizer.decode(sample['target_tokens'])
input_tokens = tokenizer.decode(sample['input_tokens'])
background_text = tokenizer.batch_decode(sample['background_text'])
facts, question = sample['facts'], sample['question']

# Task part of the sample, followed by one empty line.
print(
    f"Facts: {' '.join(facts)}",
    f"Question: {question}",
    f"Answer: {answer}",
    f"References: {' '.join(sample['references'])}",
    "",
    sep="\n",
)

# Background text, where the facts were placed, and the combined input.
print('Background text: ', ' '.join(background_text))
print('Fact positions: ', sample['fact_positions'])
print('Combined input: ', input_tokens)

# The target is the decoded target_tokens, i.e. the same text as "Answer".
print(f"Target: {answer}")


Facts: Mary moved to the bathroom. John went to the hallway.
Question: Where is Mary? 
Answer: bathroom
References: Mary moved to the bathroom.

Background text:  The player progresses through a series of linear missions, gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked.
Fact positions:  [0 1]
Combined input:  Mary moved to the bathroom.The player progresses through a series of linear missions, gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked.John went to the hallway.Where is Mary? 
Target: bathroom


### Collate function

In [14]:
import torch
from torch.nn.utils.rnn import pad_sequence

eos_token = tokenizer.eos_token_id
# Pad with the tokenizer's own pad id when it has one, otherwise fall back to EOS.
id_pad_value = eos_token if tokenizer.pad_token_id is None else tokenizer.pad_token_id
# First token id of the encoded string 'GEN'; collate_fn places it between
# the input tokens and the target tokens.
gen_token = tokenizer.encode('GEN')[0]

def collate_fn(batch):
    """Collate dataset samples into right-padded tensors for training and generation.

    Uses the module-level ``gen_token``, ``eos_token`` and ``id_pad_value``.

    Args:
        batch: list of sample dicts. Each must provide ``'input_tokens'`` and
            ``'target_tokens'`` (lists of token ids) and ``'answer'``.

    Returns:
        dict with the keys
            ``input_ids``: ``input + [gen_token] + target + [eos_token]`` per
                sample, padded with ``id_pad_value``.
            ``labels``: a copy of ``input_ids`` with its own storage.
            ``input_ids_generate``: ``input + [gen_token]`` per sample, padded.
            ``labels_mask``: bool, True on the last ``len(target) + 2``
                unpadded positions (gen token, target tokens, eos).
            ``attention_mask`` / ``attention_mask_generate``: bool, True on
                every real (non-padding) position.
            ``target_text``: list of the samples' ``'answer'`` values.
    """
    input_ids = [torch.tensor(b['input_tokens'] + [gen_token] + b['target_tokens'] + [eos_token]) for b in batch]
    gen_inputs = [torch.tensor(b['input_tokens'] + [gen_token]) for b in batch]

    # Masks are built per sample from the true sequence lengths *before*
    # padding. Deriving them afterwards via `ids != id_pad_value` would also
    # mask out genuine tokens whenever the pad id (which may be the EOS id)
    # occurs inside the input.
    attention_mask = [torch.ones_like(ids, dtype=torch.bool) for ids in input_ids]
    attention_mask_generate = [torch.ones_like(ids, dtype=torch.bool) for ids in gen_inputs]

    labels_mask = [torch.zeros_like(ids, dtype=torch.bool) for ids in input_ids]
    for mask, b in zip(labels_mask, batch):
        # gen token + target tokens + eos token
        mask[-len(b['target_tokens']) - 2:] = True

    input_ids = pad_sequence(input_ids, padding_value=id_pad_value, batch_first=True)
    gen_inputs = pad_sequence(gen_inputs, padding_value=id_pad_value, batch_first=True)
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True)
    attention_mask_generate = pad_sequence(attention_mask_generate, padding_value=0, batch_first=True)
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True)

    collated = {}
    collated['input_ids'] = input_ids
    # Separate storage, so in-place edits of the labels cannot corrupt the inputs.
    collated['labels'] = input_ids.clone()
    collated['input_ids_generate'] = gen_inputs
    collated['labels_mask'] = labels_mask
    collated['attention_mask'] = attention_mask.bool()
    collated['attention_mask_generate'] = attention_mask_generate.bool()

    collated['target_text'] = [b['answer'] for b in batch]
    return collated

In [18]:
# Build a two-sample batch by hand and run it through collate_fn.
batch = [dataset_train[i] for i in range(2)]
collated = collate_fn(batch)
# Cell output: the keys of the collated batch.
collated.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text'])

In [36]:

def visualize_masked_prediction(collated):
    """Print, per sample, the (input, label) token pairs selected by labels_mask.

    Each row of ``collated['input_ids']`` is decoded one token at a time via
    the module-level ``tokenizer``. The token at position k is paired with the
    token at position k + 1 as its label, and only pairs whose position k is
    set in ``collated['labels_mask']`` are printed.
    """
    rows = zip(collated['input_ids'], collated['labels_mask'])
    for i, (input_ids, labels_mask) in enumerate(rows):
        print(f"Sample #{i}")

        # One decoded string per token id; inputs and labels share this sequence.
        tokens = np.asarray(tokenizer.batch_decode(input_ids))

        # Drop the last mask entry so it lines up with the shifted sequences.
        step_mask = labels_mask[:-1]
        print("Model INPUT at each step:", tokens[:-1][step_mask])
        print("Model LABEL at each step:", tokens[1:][step_mask])
        print()


visualize_masked_prediction(collated)

Sample #0
Model INPUT at each step: ['GEN' 'bath' 'room']
Model LABEL at each step: ['bath' 'room' '<|endoftext|>']

Sample #1
Model INPUT at each step: ['GEN' 'hall' 'way' '<|endoftext|>']
Model LABEL at each step: ['hall' 'way' '<|endoftext|>' '<|endoftext|>']



In [22]:
# labels are marked with labels_mask: pick the masked positions of every row
# of input_ids and decode each selection to a string
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['labels_mask'])])

['GENbathroom<|endoftext|>',
 'GENhallway<|endoftext|>',
 'GENhallway<|endoftext|>',
 'GENoffice<|endoftext|>',
 'GENbathroom<|endoftext|>',
 'GENbathroom<|endoftext|>',
 'GENbathroom<|endoftext|>',
 'GENbathroom<|endoftext|>',
 'GENoffice<|endoftext|>',
 'GENhallway<|endoftext|>']

In [24]:
# different input_ids for .forward() and .generate(); this cell decodes the
# first row of `input_ids`, restricted to the positions where attention_mask
# is set
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['attention_mask'])][:1])

[' Troops are divided into five classes : Scouts, Shocktroopers, Engineers, Lancers and Armored Soldier.Troopers can switch classes by changing their assigned weapon.Changing class does not greatly affect the stats gained while in a previous class.With victory in battle, experience points are awarded to the squad, which are distributed into five different attributes shared by the entire squad, a feature differing from early games\'method of distributing to different unit types. = = Plot = = The game takes place during the Second Europan War.Gallian Army Squad 422, also known as " The Nameless ", are a penal military unit composed of criminals, foreign deserters, and military offenders whose real names are erased from the records and thereon officially referred to by numbers.Ordered by the Gallian military to perform the most dangerous missions that the Regular Army and Militia will not do, they are nevertheless up to the task, exemplified by their motto, Altaha Abilia, meaning " Always

In [25]:
# Same check for the generation inputs: decode the first row of
# `input_ids_generate`, restricted to the positions where
# attention_mask_generate is set.
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids_generate'], collated['attention_mask_generate'])][:1])

[' Troops are divided into five classes : Scouts, Shocktroopers, Engineers, Lancers and Armored Soldier.Troopers can switch classes by changing their assigned weapon.Changing class does not greatly affect the stats gained while in a previous class.With victory in battle, experience points are awarded to the squad, which are distributed into five different attributes shared by the entire squad, a feature differing from early games\'method of distributing to different unit types. = = Plot = = The game takes place during the Second Europan War.Gallian Army Squad 422, also known as " The Nameless ", are a penal military unit composed of criminals, foreign deserters, and military offenders whose real names are erased from the records and thereon officially referred to by numbers.Ordered by the Gallian military to perform the most dangerous missions that the Regular Army and Militia will not do, they are nevertheless up to the task, exemplified by their motto, Altaha Abilia, meaning " Always

### Create a dataloader

In [14]:
from torch.utils.data import DataLoader

# Batches of two training samples, assembled by collate_fn.
dl = DataLoader(dataset_train, batch_size=2, collate_fn=collate_fn)

# Pull the first batch from a fresh iterator; the cell output is its keys.
gen = iter(dl)
batch = next(gen)
batch.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text'])