In [1]:
import numpy as np
import sys
import re
import datasets
from transformers import AutoTokenizer
sys.path.append("..")
from babilong_utils import TaskDataset, SentenceSampler, NoiseInjectionDataset

In [2]:
# ### extract dataset archive
# !unzip ../data/tasks_1-20_v1-2.zip -d ../data/

In [3]:
import torch
from torch.nn.utils.rnn import pad_sequence
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Special ids shared by collate_fn below.
eos_token = tokenizer.eos_token_id
# Only the first token id of the encoded 'GEN' marker is kept.
gen_token = tokenizer.encode('GEN')[0]
# Pad with the tokenizer's pad id when it has one, otherwise reuse EOS.
id_pad_value = eos_token if tokenizer.pad_token_id is None else tokenizer.pad_token_id

def collate_fn(batch):
    """Pad a list of samples into one batch for both training and generation.

    Each sample must provide 'input_tokens', 'question_tokens', 'target_tokens'
    and 'answer'. The token fields are concatenated with `+`, so they are
    presumably plain Python lists of token ids -- TODO confirm.

    Returns a dict with:
        input_ids / labels: the same padded tensor of
            input + question + GEN + target + EOS.
        input_ids_generate: padded tensor of input + question + GEN.
        labels_mask: bool mask selecting GEN, the target tokens and EOS.
        attention_mask / attention_mask_generate: bool masks that are True on
            real (non-padding) positions.
        target_text: list with each sample's 'answer'.
    """
    targets = [torch.tensor(b['target_tokens']) for b in batch]
    input_ids = [torch.tensor(b['input_tokens'] + b['question_tokens'] + [gen_token] + b['target_tokens'] + [eos_token]) for b in batch]
    gen_inputs = [torch.tensor(b['input_tokens'] + b['question_tokens'] + [gen_token]) for b in batch]

    attention_mask = [torch.ones_like(b, dtype=int) for b in input_ids]
    # Build the generation mask from the true sequence lengths. Comparing the
    # padded ids with id_pad_value would also mask real tokens that share the
    # pad id (the pad id falls back to EOS when the tokenizer has no pad token).
    gen_attention_mask = [torch.ones_like(b, dtype=bool) for b in gen_inputs]
    labels_mask = [torch.zeros_like(b, dtype=bool) for b in input_ids]
    for m, t in zip(labels_mask, targets):
        # tail of each sequence: GEN + target tokens + EOS
        m[-len(t) - 2:] = True

    input_ids = pad_sequence(input_ids, padding_value=id_pad_value, batch_first=True)
    gen_inputs = pad_sequence(gen_inputs, padding_value=id_pad_value, batch_first=True)
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True)
    gen_attention_mask = pad_sequence(gen_attention_mask, padding_value=0, batch_first=True)
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True)

    collated = {}
    # 'input_ids' and 'labels' intentionally alias the same tensor object.
    collated['input_ids'] = collated['labels'] = input_ids
    collated['input_ids_generate'] = gen_inputs
    collated['labels_mask'] = labels_mask
    collated['attention_mask'] = attention_mask.bool()
    collated['attention_mask_generate'] = gen_attention_mask
    collated['target_text'] = [b['answer'] for b in batch]
    return collated

In [4]:
# train_path = "../data/tasks_1-20_v1-2/en-10k/qa4_two-arg-relations_train.txt"
# test_path = "../data/tasks_1-20_v1-2/en-10k/qa4_two-arg-relations_test.txt"

# train_path = "../data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt"
# test_path = "../data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt"

# train_path = "../data/tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_train.txt"
# test_path = "../data/tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_test.txt"

# train_path = "../data/tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_train.txt"
# test_path = "../data/tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_test.txt"

# train_path = "../data/tasks_1-20_v1-2/en-10k/qa5_three-arg-relations_train.txt"
# test_path = "../data/tasks_1-20_v1-2/en-10k/qa5_three-arg-relations_test.txt"

# Source of background ("noise") text: its 'train' / 'test' splits are handed to
# SentenceSampler in later cells.
noise_dataset = datasets.load_dataset("pg19")

Using custom data configuration default
Reusing dataset pg19 (/home/jovyan/.cache/huggingface/datasets/pg19/default/0.1.0/64837d6fce7251337df051ca74e9a5435d1c9cb7f3033ba257826e44d338f83c)


In [92]:
# Task whose train file is loaded below. Assigned once instead of being
# overwritten repeatedly; the other task names used in this notebook are:
#   "qa1_single-supporting-fact"
#   "qa2_two-supporting-facts"
#   "qa3_three-supporting-facts"
#   "qa4_two-arg-relations"
task = "qa5_three-arg-relations"

train_path = f"../data/tasks_1-20_v1-2/en-10k/{task}_train.txt"
task_dataset_train = TaskDataset(train_path, max_n_facts=None)

noise_sampler_train = SentenceSampler(noise_dataset['train'], tokenizer=tokenizer, shuffle=True)

max_n_segments = 1
segment_size = 512               # max number of tokens in sample
qa_margin = 0
sample_size = max_n_segments * segment_size - qa_margin

dataset_train = NoiseInjectionDataset(task_dataset=task_dataset_train,
                                        noise_sampler=noise_sampler_train,
                                        tokenizer=tokenizer,
                                        sample_size=sample_size)

In [174]:
# Pick a random index in [0, 100); no seed is set, so the sample changes per run.
i = np.random.randint(100)
sample = dataset_train[i]
# NOTE(review): `collated` is not used further in this cell (only by the
# commented-out decode below).
collated = collate_fn([sample])

# text = tokenizer.decode(collated['input_ids'][0])
# Decode only `input_tokens`, i.e. without the question / GEN / target parts
# that collate_fn appends.
text = tokenizer.decode(sample['input_tokens'])

question = sample['question']
answer = sample['answer']

In [175]:
# Decode every background chunk and flatten it onto a single line by turning
# newlines into spaces.
sentences = [
    re.sub("\n", ' ', tokenizer.decode(s))
    for s in sample['background_text']
]

In [176]:
# Insert each fact, wrapped in a LaTeX bold macro, at its recorded position.
# `add` counts the facts already inserted, since every insertion shifts the
# following positions by one.
add = 0
for i, p in enumerate(sample['fact_positions']):
    fact = sample['facts'][i]
    # Raw string: in a normal literal "\t" is a TAB character, which would turn
    # the macro into TAB + "extbf{".
    fact = r"\textbf{" + fact + "}"
    sentences.insert(p + add, fact)
    add += 1


In [177]:
# sentences += ["\textbf{" + f"Question: {question} Answer: {answer}" + '}']
sentences += [' ...']
# sentences

In [178]:
# facts
# Raw string keeps the backslash ("\t" in a normal literal is a TAB); print()
# shows the text itself instead of its repr, so it can be copied as-is.
print(r'\textbf{Facts:} ' + ' '.join(sample['facts']))

'\textbf{Facts:} Fred grabbed the football there. Jeff took the apple there. Jeff dropped the apple. Bill picked up the apple there. Mary travelled to the kitchen. Mary went back to the hallway. Bill went to the garden. Fred travelled to the garden. Bill passed the apple to Fred. Fred left the apple. Fred went back to the hallway. Fred handed the football to Mary.'

In [179]:
# text
# Raw string keeps the backslash ("\t" in a normal literal is a TAB); print()
# shows the text itself instead of its repr, so it can be copied as-is.
print(r'\textbf{Context:} ' + ' '.join(sentences))

"\textbf{Context:} It was evident that the besiegers were in no hurry; that they were living upon the provisions left in the valley; and that it was their intention to reduce the besieged by famine. \textbf{Fred grabbed the football there.} \textbf{Jeff took the apple there.} In fact the inhabitants of the Val d'Avon had been able to carry with them only a small quantity of provisions. \textbf{Jeff dropped the apple.} We have described the three kinds of porcelain made in Hizen for exportation to Europe, and we have seen that by the middle of the seventeenth century this commerce, in the hands of the Dutch, and to some extent of the Chinese, had already attained large proportions. Before turning to the kilns that sprung up in other parts of Japan during the eighteenth century--of these the origin in every case can be traced back directly or indirectly to the early Hizen factories--we must say a word about some other varieties of porcelain made in the same neighbourhood, but not destine

In [180]:
# task
# Raw string keeps the backslash ("\t" in a normal literal is a TAB); print()
# shows the text itself instead of its repr, so it can be copied as-is.
print(r"\textbf{Question:} " + question)

'\textbf{Question:} What did Bill give to Fred? '

In [181]:
# answer
# Raw string keeps the backslash ("\t" in a normal literal is a TAB); print()
# shows the text itself instead of its repr, so it can be copied as-is.
print(r"\textbf{Answer:} " + answer)

'\textbf{Answer:} apple'

In [41]:
sample['fact_positions']

array([11, 12])

In [38]:
# (start, end) character span of the first occurrence of every fact in `text`;
# str.index raises ValueError when a fact is not present.
fact_positions = [
    (start, start + len(f))
    for f in sample['facts']
    for start in [text.index(f)]
]
# Flatten the spans into one list of cut points: start0, end0, start1, end1, ...
split_inds = list(sum(fact_positions, ()))

# Cut the character sequence at those points and glue each piece back into a string.
splited = [''.join(chunk) for chunk in np.split(list(text), split_inds)]

In [39]:
splited

['But I find a yet more unmistakable evidence in support of my contention\nin the extraordinary emotional sensibility revealed by these headlines\nwhenever some unfortunate person has been sentenced to death for\nthe most commonplace murder.There is clearly a profound conviction\nthat the jury who heard the evidence, the judge who pronounced their\nverdict of guilty, the only possible conclusion they could reasonable\ncome to, and the HOME SECRETARY who found himself unable to recommend\na reprieve, were, one and all, engaged in a cold-blooded conspiracy\nagainst a perfectly innocent man.The convict has said to himself, and\nthat seems to be considered sufficient.And so, night after night, the\nauthors of these headlines harrow themselves by announcing such items\nas "Blank protests his innocence to his Solicitor.""Distressing Scene on the Scaffold."Consider the strain of all these alterations of hope and despair,\nrepeated time after time, and almost invariably without even the\nconso

['But I find a yet more unmistakable evidence in support of my contention\nin the extraordinary emotional sensibility revealed by these headlines\nwhenever some unfortunate person has been sentenced to death for\nthe most commonplace murder.There is clearly a profound conviction\nthat the jury who heard the evidence, the judge who pronounced their\nverdict of guilty, the only possible conclusion they could reasonable\ncome to, and the HOME SECRETARY who found himself unable to recommend\na reprieve, were, one and all, engaged in a cold-blooded conspiracy\nagainst a perfectly innocent man.The convict has said to himself, and\nthat seems to be considered sufficient.And so, night after night, the\nauthors of these headlines harrow themselves by announcing such items\nas "Blank protests his innocence to his Solicitor.""Distressing Scene on the Scaffold."Consider the strain of all these alterations of hope and despair,\nrepeated time after time, and almost invariably without even the\nconso

### Load task datasets

In [4]:
# task
max_n_facts = 10
# `test_path` is not assigned by any earlier cell (its explicit assignments are
# commented out), so derive it from `task` the same way `train_path` is built.
test_path = f"../data/tasks_1-20_v1-2/en-10k/{task}_test.txt"
task_dataset_train = TaskDataset(train_path, max_n_facts=max_n_facts)
task_dataset_test = TaskDataset(test_path, max_n_facts=max_n_facts)

In [29]:
# background text
# NOTE(review): this loads the 'gpt2' tokenizer again; it is already created near
# the top of the notebook.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# One sampler per split of the noise dataset. Unlike the earlier train sampler,
# no `shuffle=True` is passed here.
noise_sampler_train = SentenceSampler(noise_dataset['train'], tokenizer=tokenizer)
noise_sampler_test = SentenceSampler(noise_dataset['test'], tokenizer=tokenizer)

In [30]:
# segment_size is the max number of tokens in one segment; qa_margin is
# subtracted from the total budget (presumably to leave room for the
# question/answer tokens -- TODO confirm).
max_n_segments, segment_size, qa_margin = 2, 512, 20
sample_size = max_n_segments * segment_size - qa_margin

# Train and test datasets differ only in their task and noise sources.
dataset_train, dataset_test = (
    NoiseInjectionDataset(task_dataset=task_source,
                          noise_sampler=noise_source,
                          tokenizer=tokenizer,
                          sample_size=sample_size)
    for task_source, noise_source in (
        (task_dataset_train, noise_sampler_train),
        (task_dataset_test, noise_sampler_test),
    )
)

In [101]:
# for n_seg in range(40):
#     sample_size = (512 - 1) * n_seg
#     print(n_seg, sample_size % 512)


In [102]:
sample = dataset_train[0]
sample.keys()

dict_keys(['facts', 'question', 'answer', 'references', 'question_tokens', 'target_tokens', 'background_text', 'fact_positions', 'input_tokens'])

In [103]:
sample['facts']

array(['Bill travelled to the office.',
       'Bill picked up the football there.', 'Bill went to the bedroom.',
       'Bill gave the football to Fred.'], dtype=object)

In [104]:
i = 120

In [106]:
# # i += 1
# print(i)
# text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
# text[text.lower().index('babylon'):]

In [107]:
# Step to the next collected sample and show its text from the first
# occurrence of 'babylon' onward.
# NOTE(review): `babylon_samples` is created and filled by cells further down,
# so on a fresh kernel this cell raises NameError (as in the recorded output).
i += 1
print(i)
text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
text[text.lower().index('babylon'):]

121


NameError: name 'babylon_samples' is not defined

In [77]:
# Step to the next collected sample and show its text from the first
# occurrence of 'babylon' onward.
# NOTE(review): depends on `babylon_samples`, which is only created and filled
# by cells further down in the notebook.
i += 1
print(i)
text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
text[text.lower().index('babylon'):]

135


'Babylonia, like Egypt, is one network of canals, the largest of\nwhich is navigable.It is far the best corn-land of all the countries I\nknow.There is no attempt at arboriculture--figs or vines or olives--but\nit is such superb corn-land that the average yield is two-hundredfold,\nand three-hundredfold in the best years.Mary moved to the bathroom.The wheat and barley there are\na good four inches broad in the blade, and millet and sesame grow as big\nas trees--but I will not state the dimensions I have ascertained,\nbecause I know that, for anyone who has not visited Babylonia and\nwitnessed these facts about the crops for himself, they would be\naltogether beyond belief."Harnessed in the irrigation channels, the Tigris and Euphrates had\nbecome as mighty forces of production as the Nile and the Ganges, the\nYangtse and the Hoang-Ho."This," Herodotus adds[54], "is the best demonstration I can give of the\nwealth of the Babylonians: All the lands ruled by the King of Persia are\nassess

In [83]:
# Step to the next collected sample and show its text from the first
# occurrence of 'babylon' onward.
# NOTE(review): depends on `babylon_samples`, which is only created and filled
# by cells further down in the notebook.
i += 1
print(i)
text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
text[text.lower().index('babylon'):]

141


"Babylonian tower of smoke into the rain, and oppressing men's hearts\nwith the scream of her whistle.The engineer was there himself; he paled\nas he made the signal: the engine came at a foot's pace; but the whole\nbulk of mountain shook and seemed to nod seaward, and the watching\nnavvies instinctively clutched at shrubs and trees: vain precautions,\nvain as the shots from the poor sailors.Once again fear was\ndisappointed; the train passed unscathed; and Norris, drawing a long\nbreath, remembered the labouring ship and glanced below.So the days and the nights passed: Homeric labour in Homeric\ncircumstance.Carthew was sick with sleeplessness and coffee; his hands,\nsoftened by the wet, were cut to ribbons; yet he enjoyed a peaceWhere is Mary? GENbathroom<|endoftext|>"

In [105]:
# Step to the next collected sample and show its text from the first
# occurrence of 'babylon' onward.
# NOTE(review): depends on `babylon_samples`, which is only created and filled
# by cells further down in the notebook.
i += 1
print(i)
text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
text[text.lower().index('babylon'):]

162


'Babylonian walls.The nothingness of kingly greatness and national pride were never\nbefore so finely contemned as by the voluptuous Assyrian, and were\nthe scorn not mitigated by the skilful intermixture of mercifulness\nand philanthropy, the character would not be endurable.John went to the hallway.But when the\nsame voice which pronounced contempt on the toils of honour says,\n\n\n      Enough\nFor me if I can make my subjects feel\nThe weight of human misery less,\n\n\nit is impossible to repress the liking which the humane spirit of\nthat thought is calculated to inspire.Nor is there any want of\ndignity in Sardanapalus, even when lolling softest in his luxury.Must I consume my life--this little life--\nIn guarding against all may make it less!It is not worth so much--It were to die\nBefore my hour to live in dread of death.Till now no drop of an Assyrian vein\nHath flow\'d for me, nor hath the smallest coin\nOf Nineveh\'s vast treasure e\'er been lavish\'d\nOn objects which could

In [60]:
len(babylon_samples)

153

In [33]:
babylon_samples = []

In [96]:
# Repeatedly fetch item 0, collate it as a one-element batch, and keep every
# batch whose decoded text mentions 'babylon' (case-insensitive).
for _ in range(10000):
    sample = dataset_train[0]
    collated = collate_fn([sample])
    text = tokenizer.decode(collated['input_ids'][0])
    if 'babylon' not in text.lower():
        continue
    print('Found it!')
    babylon_samples.append(collated)

Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!
Found it!


KeyboardInterrupt: 

In [107]:
# Print the full decoded text of one collected sample (index fixed at 135).
# NOTE(review): needs at least 136 entries in `babylon_samples`.
i = 135
print(i)
text = tokenizer.decode(babylon_samples[i]['input_ids'][0])
# print(text[text.lower().index('babylon') - 100:])
print(text)

135
"The land," writes Herodotus[52], who saw it in its prime, "has a little
rain, and this nourishes the corn at the root; but the crops are matured
and brought to harvest by water from the river--not, as in Egypt, by the
river flooding over the fields, but by human labour and _shadufs_[53]
For Babylonia, like Egypt, is one network of canals, the largest of
which is navigable.It is far the best corn-land of all the countries I
know.There is no attempt at arboriculture--figs or vines or olives--but
it is such superb corn-land that the average yield is two-hundredfold,
and three-hundredfold in the best years.Mary moved to the bathroom.The wheat and barley there are
a good four inches broad in the blade, and millet and sesame grow as big
as trees--but I will not state the dimensions I have ascertained,
because I know that, for anyone who has not visited Babylonia and
witnessed these facts about the crops for himself, they would be
altogether beyond belief."Harnessed in the irrigation cha

### Visualize one sample

In [25]:
input_lens = [len(dataset_train[i]['input_tokens']) for i in range(1000)]

In [26]:
np.unique(input_lens)

array([492])

In [27]:
# np.unique(input_lens)

In [29]:
# Decode the pieces of the current `sample` into readable text.
facts, question = sample['facts'], sample['question']
answer = tokenizer.decode(sample['target_tokens'])  # decoded from the target token ids
background_text = tokenizer.batch_decode(sample['background_text'])
input_tokens = tokenizer.decode(sample['input_tokens'])

# Header lines followed by one blank line, emitted with a single print call.
print(
    f"Facts: {' '.join(facts)}",
    f"Question: {question}",
    f"Answer: {answer}",
    f"References: {' '.join(sample['references'])}",
    "",
    sep="\n",
)
print('Background text: ', ' '.join(background_text))
print('Fact positions: ', sample['fact_positions'])
print('Combined input: ', input_tokens)

print(f"Target: {answer}")


Facts: Bill travelled to the office. Bill picked up the football there. Bill went to the bedroom. Bill gave the football to Fred.
Question: What did Bill give to Fred? 
Answer: football
References: Bill gave the football to Fred.

Background text:  













The Old Testament of the King James Version of the Bible




The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth. 1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters. 1:3 And God said, Let there be light: and there was light. 1:4 And God saw the light, that it was good: and God divided the light
from the darkness. 1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day. 1:6 And God said, Let there be a firmament in the midst of the waters,
and let it divide the waters from the waters. 1:7 And God made the firmament, and divid

### Collate function

In [20]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Special ids shared by collate_fn below.
eos_token = tokenizer.eos_token_id
# Only the first token id of the encoded 'GEN' marker is kept.
gen_token = tokenizer.encode('GEN')[0]
# Pad with the tokenizer's pad id when it has one, otherwise reuse EOS.
id_pad_value = eos_token if tokenizer.pad_token_id is None else tokenizer.pad_token_id

def collate_fn(batch):
    """Pad a list of samples into one batch for both training and generation.

    Each sample must provide 'input_tokens', 'question_tokens', 'target_tokens'
    and 'answer'. The token fields are concatenated with `+`, so they are
    presumably plain Python lists of token ids -- TODO confirm.

    Returns a dict with:
        input_ids / labels: the same padded tensor of
            input + question + GEN + target + EOS.
        input_ids_generate: padded tensor of input + question + GEN.
        labels_mask: bool mask selecting GEN, the target tokens and EOS.
        attention_mask / attention_mask_generate: bool masks that are True on
            real (non-padding) positions.
        target_text: list with each sample's 'answer'.
    """
    targets = [torch.tensor(b['target_tokens']) for b in batch]
    input_ids = [torch.tensor(b['input_tokens'] + b['question_tokens'] + [gen_token] + b['target_tokens'] + [eos_token]) for b in batch]
    gen_inputs = [torch.tensor(b['input_tokens'] + b['question_tokens'] + [gen_token]) for b in batch]

    attention_mask = [torch.ones_like(b, dtype=int) for b in input_ids]
    # Build the generation mask from the true sequence lengths. Comparing the
    # padded ids with id_pad_value would also mask real tokens that share the
    # pad id (the pad id falls back to EOS when the tokenizer has no pad token).
    gen_attention_mask = [torch.ones_like(b, dtype=bool) for b in gen_inputs]
    labels_mask = [torch.zeros_like(b, dtype=bool) for b in input_ids]
    for m, t in zip(labels_mask, targets):
        # tail of each sequence: GEN + target tokens + EOS
        m[-len(t) - 2:] = True

    input_ids = pad_sequence(input_ids, padding_value=id_pad_value, batch_first=True)
    gen_inputs = pad_sequence(gen_inputs, padding_value=id_pad_value, batch_first=True)
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True)
    gen_attention_mask = pad_sequence(gen_attention_mask, padding_value=0, batch_first=True)
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True)

    collated = {}
    # 'input_ids' and 'labels' intentionally alias the same tensor object.
    collated['input_ids'] = collated['labels'] = input_ids
    collated['input_ids_generate'] = gen_inputs
    collated['labels_mask'] = labels_mask
    collated['attention_mask'] = attention_mask.bool()
    collated['attention_mask_generate'] = gen_attention_mask
    collated['target_text'] = [b['answer'] for b in batch]
    return collated

In [31]:
batch = [dataset_train[i] for i in range(10)]
collated = collate_fn(batch)
collated.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text'])

In [32]:
collated['input_ids'].shape

torch.Size([10, 504])

In [33]:
tokenizer.batch_decode(collated['input_ids'][:, -12:])

['What did Bill give to Fred? GENfootball<|endoftext|><|endoftext|>',
 'Who received the football? GENBill<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who received the football? GENBill<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who received the milk? GENBill<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who gave the milk? GENJeff<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who received the milk? GENJeff<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who did Mary give the milk to? GENJeff<|endoftext|>',
 'Who gave the milk? GENMary<|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'Who did Jeff give the milk to? GENBill<|endoftext|>',
 'Who received the milk? GENBill<|endoftext|><|endoftext|><|endoftext|><|endoftext|>']

In [39]:
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['labels_mask'])])

['GENfootball<|endoftext|>',
 'GENBill<|endoftext|>',
 'GENBill<|endoftext|>',
 'GENBill<|endoftext|>',
 'GENJeff<|endoftext|>',
 'GENJeff<|endoftext|>',
 'GENJeff<|endoftext|>',
 'GENMary<|endoftext|>',
 'GENBill<|endoftext|>',
 'GENBill<|endoftext|>']

In [40]:
tokenizer.batch_decode([c[m][-12:] for c, m in zip(collated['input_ids'], collated['attention_mask'])])

['.What did Bill give to Fred? GENfootball<|endoftext|>',
 ' sup, andWho received the football? GENBill<|endoftext|>',
 ', that lightWho received the football? GENBill<|endoftext|>',
 'versary.Who received the milk? GENBill<|endoftext|>',
 ' to Bill.Who gave the milk? GENJeff<|endoftext|>',
 'ocked, andWho received the milk? GENJeff<|endoftext|>',
 'Who did Mary give the milk to? GENJeff<|endoftext|>',
 ' the garden.Who gave the milk? GENMary<|endoftext|>',
 'Who did Jeff give the milk to? GENBill<|endoftext|>',
 ' say, WhyWho received the milk? GENBill<|endoftext|>']

In [41]:
tokenizer.batch_decode([c[m][-12:] for c, m in zip(collated['input_ids_generate'], collated['attention_mask_generate'])])

[' to Fred.What did Bill give to Fred? GEN',
 ' I may sup, andWho received the football? GEN',
 ' the lightning, that lightWho received the football? GEN',
 '\nadversary.Who received the milk? GEN',
 ' the milk to Bill.Who gave the milk? GEN',
 '\nmocked, andWho received the milk? GEN',
 ' that isWho did Mary give the milk to? GEN',
 'ed to the garden.Who gave the milk? GEN',
 '\nofWho did Jeff give the milk to? GEN',
 ' he will say, WhyWho received the milk? GEN']

In [42]:
tokenizer.batch_decode(collated['input_ids_generate'][:, -12:])

[' Fred.What did Bill give to Fred? GEN<|endoftext|>',
 ', andWho received the football? GEN<|endoftext|><|endoftext|><|endoftext|>',
 ' that lightWho received the football? GEN<|endoftext|><|endoftext|><|endoftext|>',
 'ary.Who received the milk? GEN<|endoftext|><|endoftext|><|endoftext|>',
 ' Bill.Who gave the milk? GEN<|endoftext|><|endoftext|><|endoftext|>',
 ', andWho received the milk? GEN<|endoftext|><|endoftext|><|endoftext|>',
 ' that isWho did Mary give the milk to? GEN',
 ' garden.Who gave the milk? GEN<|endoftext|><|endoftext|><|endoftext|>',
 '\nofWho did Jeff give the milk to? GEN',
 ', WhyWho received the milk? GEN<|endoftext|><|endoftext|><|endoftext|>']

### Test a model

In [43]:
from transformers import AutoModelForCausalLM
from modeling_rmt.language_modeling import MemoryCell, RecurrentWrapper
from modeling_rmt.experimental import MemoryCellGenerate

In [52]:
# Wrap a pretrained GPT-2 LM in a memory cell, then in the recurrent wrapper.
# Note that `model` is rebound: first the bare LM, afterwards the wrapper.
model = AutoModelForCausalLM.from_pretrained('gpt2')
cell = MemoryCellGenerate(model, num_mem_tokens=16)
model = RecurrentWrapper(cell, 
                        segment_size=512,
                        max_n_segments=1, 
                        segment_alignment=None,
                        k2=-1,
)
                            

## load the RMT checkpoint (cpt)
# NOTE(review): the checkpoint paths below are absolute and machine-specific;
# the notebook cannot run elsewhere without editing them.
# cpt_path = "/home/jovyan/rmt/runs/babilong/qa1_single-supporting-fact/gpt2/linear_adamw_wd1e-03_64x512_mem16_bs64_bptt--1_from_cpt_32-64/run_4/model_best/pytorch_model.bin"
cpt_path = "/home/jovyan/rmt/runs/babilong/qa5_three-arg-relations/gpt2/linear_adamw_wd1e-03_1x512_mem16_bs64_bptt--1_from_cpt_0-1/run_4/model_best/pytorch_model.bin"
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
cpt = torch.load(cpt_path, map_location='cpu')
# strict=False tolerates missing/unexpected keys instead of raising.
model.load_state_dict(cpt, strict=False)

<All keys matched successfully>

In [55]:
# input_ids_generate = collated.pop('input_ids_generate')
# attention_mask_generate = collated.pop('attention_mask_generate')
# target_text = collated.pop('target_text')

In [94]:
out = model(**collated)

In [95]:
out.loss

tensor(1.1576e-05, grad_fn=<NllLossBackward0>)

In [58]:
# Read the generation inputs straight from `collated`: the cell that would pop
# them into standalone variables is commented out, so those names do not exist
# on a fresh kernel.
generated = model.generate(
    collated['input_ids_generate'],
    attention_mask=collated['attention_mask_generate'],
    max_new_tokens=10,
    pad_token_id=id_pad_value,
)

In [59]:
target_text

['football',
 'Bill',
 'Bill',
 'Bill',
 'Jeff',
 'Jeff',
 'Jeff',
 'Mary',
 'Bill',
 'Bill']

In [99]:
tokenizer.decode(input_ids[0][attention_mask[0]])

"16:12 And if\nye have not been faithful in that which is another man's, who shall\ngive you that which is your own?16:13 No servant can serve two\nmasters: for either he will hate the one, and love the other; or else\nhe will hold to the one, and despise the other.Ye cannot serve God\nand mammon.Bill travelled to the office.Bill picked up the football there.16:14 And the Pharisees also, who were covetous, heard all these\nthings: and they derided him.Bill went to the bedroom.16:15 And he said unto them, Ye are they which justify yourselves\nbefore men; but God knoweth your hearts: for that which is highly\nesteemed among men is abomination in the sight of God.16:16 The law and the prophets were until John: since that time the\nkingdom of God is preached, and every man presseth into it.16:17 And it is easier for heaven and earth to pass, than one tittle\nof the law to fail.16:18 Whosoever putteth away his wife, and marrieth another,\ncommitteth adultery: and whosoever marrieth her that

In [113]:
# Manual, step-by-step version of segment-wise generation for one example.
# `self` aliases the wrapper so the lines below can be written like method code.
self = model
# Take the generation inputs from `collated` (no standalone
# `input_ids_generate` / `attention_mask_generate` variables are defined on a
# fresh kernel). Row 1 only, with the last 3 columns dropped; this assumes
# `collated` holds a batch of at least two samples -- TODO confirm.
input_ids = collated['input_ids_generate'][1:2, :-3]
attention_mask = collated['attention_mask_generate'][1:2, :-3]
generate_kwargs = {"max_new_tokens":10, "pad_token_id":id_pad_value}


memory_state = None
segmented = self.segment(input_ids=input_ids, attention_mask=attention_mask)

# print('\n\n\nGenerate: ', [s['input_ids'].shape for s in segmented])
# Run every segment except the last through the memory cell, carrying the
# memory state forward from one segment to the next.
for seg_num, segment in enumerate(segmented[:-1]):
    cell_out, memory_state = self.memory_cell(**segment, memory_state=memory_state, output_hidden_states=True)

# Generation itself happens on the final segment, using the accumulated memory state.
final_segment = segmented[-1]
out = self.memory_cell.generate(**final_segment, memory_state=memory_state, **generate_kwargs)

In [114]:
tokenizer.decode(final_segment['input_ids'][0])

"16:26 And beside all this, between us and you there is a great gulf\nfixed: so that they which would pass from hence to you cannot; neither\ncan they pass to us, that would come from thence.Bill travelled to the office.16:27 Then he said, I pray thee therefore, father, that thou wouldest\nsend him to my father's house: 16:28 For I have five brethren; that he\nmay testify unto them, lest they also come into this place of torment.16:29 Abraham saith unto him, They have Moses and the prophets; let\nthem hear them.Bill picked up the football there.16:30 And he said, Nay, father Abraham: but if one went unto them from\nthe dead, they will repent.16:31 And he said unto him, If they hear not Moses and the prophets,\nneither will they be persuaded, though one rose from the dead.17:1 Then said he unto the disciples, It is impossible but that\noffences will come: but woe unto him, through whom they come!17:2 It\nwere better for him that a millstone were hanged about his neck, and\nhe cast into 

In [115]:
tokenizer.batch_decode(out)

['<|endoftext|>Bill<|endoftext|>']

In [17]:
segments = torch.split(collated['input_ids_generate'], 512, dim=1)

In [18]:
[s.shape for s in segments]

[torch.Size([10, 512]), torch.Size([10, 6])]

In [70]:
segments[-1][6]

tensor([ 6365,    11,   287,   262,   717,  1110,   286,   262,   198, 43556,
         1227,    13,  2091,    25,  2670,   843, 12139,   373,   281,  3470,
          290,  8208,   290,  1115,   812,  1468,   618,   339,   198,    67,
          798,   287,  3817,  6075,    13,  2091,    25,  1821,   843,  5822,
          943,   324,   262, 47047,   578,    11,   543, 43756,  2120,   287,
          262,  5366,   287,   262,   198,  1044,   286, 47047,    11,  2982,
          286,   262,  2406,   286,   262,  1751,   286,  2692,    13,   464,
        14043,   318,  7421,   286,   262, 11376,    13,  2091,    25,  3901,
          843,   484, 24057,   422,  3817,  6075,    11,   290, 21730,   287,
         1168,   282,  2144,   993,    13,  2091,    25,  3682,   843,   484,
        24057,   422,  1168,   282,  2144,   993,    11,   290, 21730,   287,
        14944,   261,    13,  2091,    25,  3559,   843,   484, 24057,   422,
        14944,   261,    11,   290, 21730,   287,  1835,   849, 

In [68]:
tokenizer.batch_decode(segments[-1])

[' of Israel, two hundred and fifty princes of\nthe assembly, famous in the congregation, men of renown: 16:3 And they\ngathered themselves together against Moses and against Aaron, and said\nunto them, Ye take too much upon you, seeing all the congregation are\nholy, every one of them, and the LORD is among them: wherefore then\nlift ye up yourselves above the congregation of the LORD?16:4 And\nwhen Moses heard it, he fell upon his face: 16:5 And he spake unto\nKorah and unto all his company, saying, Even to morrow the LORD will\nshew who are his, and who is holy; and will cause him to come near\nunto him: even him whom he hath chosen will he cause to come near unto\nhim.16:6 This do; Take you censers, Korah, and all his company; 16:7 And\nput fire therein, and put incense in them before the LORD to morrow:\nand it shall be that the man whom the LORD doth choose, he shall be\nholy: ye take too much upon you, ye sons of Levi.16:8 And Moses said unto Korah, Hear, IWhat is north of the k

In [19]:
# labels are marked with labels_mask
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['labels_mask'])])

['GENoffice<|endoftext|>',
 'GENkitchen<|endoftext|>',
 'GENbedroom<|endoftext|>',
 'GENbedroom<|endoftext|>',
 'GENkitchen<|endoftext|>',
 'GENbedroom<|endoftext|>',
 'GENgarden<|endoftext|>',
 'GENbathroom<|endoftext|>',
 'GENoffice<|endoftext|>',
 'GENbedroom<|endoftext|>']

In [15]:
# different input_ids for .forward() and .generate()
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['attention_mask'])])

["10:3 And the sons of Gomer; Ashkenaz, and Riphath, and Togarmah.10:4 And the sons of Javan; Elishah, and Tarshish, Kittim, and\nDodanim.10:5 By these were the isles of the Gentiles divided in their lands;\nevery one after his tongue, after their families, in their nations.10:6 And the sons of Ham; Cush, and Mizraim, and Phut, and Canaan.10:7 And the sons of Cush; Seba, and Havilah, and Sabtah, and Raamah,\nand Sabtechah: and the sons of Raamah; Sheba, and Dedan.10:8 And Cush begat Nimrod: he began to be a mighty one in the earth.10:9 He was a mighty hunter before the LORD: wherefore it is said,\nEven as Nimrod the mighty hunter before the LORD.10:10 And the beginning of his kingdom was Babel, and Erech, and\nAccad, and Calneh, in the land of Shinar.10:11 Out of that land went forth Asshur, and builded Nineveh, and the\ncity Rehoboth, and Calah, 10:12 And Resen between Nineveh and Calah:\nthe same is a great city.10:13 And Mizraim begat Ludim, and Anamim, and Lehabim, and Naphtuhim,\n

In [16]:
tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids_generate'], collated['attention_mask_generate'])])

["10:3 And the sons of Gomer; Ashkenaz, and Riphath, and Togarmah.10:4 And the sons of Javan; Elishah, and Tarshish, Kittim, and\nDodanim.10:5 By these were the isles of the Gentiles divided in their lands;\nevery one after his tongue, after their families, in their nations.10:6 And the sons of Ham; Cush, and Mizraim, and Phut, and Canaan.10:7 And the sons of Cush; Seba, and Havilah, and Sabtah, and Raamah,\nand Sabtechah: and the sons of Raamah; Sheba, and Dedan.10:8 And Cush begat Nimrod: he began to be a mighty one in the earth.10:9 He was a mighty hunter before the LORD: wherefore it is said,\nEven as Nimrod the mighty hunter before the LORD.10:10 And the beginning of his kingdom was Babel, and Erech, and\nAccad, and Calneh, in the land of Shinar.10:11 Out of that land went forth Asshur, and builded Nineveh, and the\ncity Rehoboth, and Calah, 10:12 And Resen between Nineveh and Calah:\nthe same is a great city.10:13 And Mizraim begat Ludim, and Anamim, and Lehabim, and Naphtuhim,\n

### Create a dataloader

In [14]:
from torch.utils.data import DataLoader

# Batches of two samples, padded by collate_fn.
dl = DataLoader(dataset_train, batch_size=2, collate_fn=collate_fn)

# Pull a single batch and list the fields it carries.
gen = iter(dl)
batch = next(gen)
batch.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text'])