### **Config**

In [1]:
import os
import sys

sys.path.insert(0, '/home/marco/epfl/magma/')
import config

In [2]:
# General configurations

# Name of the summarisation model the dataset is prepared for. It selects
# the tokenizer loaded in the Init section and the output sub-directory.
MODEL = 'pegasus'

# Regular expression used to split text into units.
# The current value splits on a newline or on a period that is not
# followed by a digit (so "3.5" is not cut in two).
# Raw string: `\.` and `\d` are regex escapes, not Python string escapes.
RE_SPLITTER = r'\.(?!\d)|\n'    # do we split sentences or paragraphs?
                                # use r'\.|\n' or r'\n', respectively

# Output path (created if missing; exist_ok avoids the check-then-create race)
OUTPUT_PATH = config.MAGMA_DIR+'datasets/merge_or_chunk/'+MODEL+'/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

### **Init**

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import gensim

# Load the tokenizer that matches MODEL. The match is by substring, so any
# MODEL value containing 'pegasus', 'bart' or 't5' is accepted.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer=\
        T5Tokenizer.from_pretrained('t5-large')
else:
    # Fail here rather than with a NameError on `tokenizer` further down.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing "
        "'pegasus', 'bart' or 't5'" % (MODEL,))

### **Karger Books Base Dataset**

In [6]:
# Read the base dataset and index it by (book, chapter, section, subsection).
base_dataset = config.MAGMA_DIR+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset).set_index(
    ['book', 'chapter', 'section', 'subsection'])

# The 'bullets' column is stored in the CSV as the text of a Python literal;
# turn it back into an object, leaving missing values untouched.
# NOTE(review): eval executes whatever the CSV contains. This is only safe
# while the file is produced by this project; consider ast.literal_eval.
df['bullets'] = df['bullets'].map(eval, na_action='ignore')

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> config.TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens)

In [7]:
# Split in sentences / paragraphs based on RE_SPLITTER.
# Empty pieces are discarded before stripping; missing texts stay NaN.
df.text = df.text.map(
    lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p != ''],
    na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
# dropna() has no subset: a row is dropped when ANY column is missing,
# which includes the NaN rows produced by exploding an empty list.
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
# (closing brackets are only stripped on the left, opening ones on the right)
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces: collapse every whitespace run into one space.
# Raw string, because `\s` is a regex escape and not a valid Python escape.
df.para = df.para.map(lambda p: re.sub(r'\s+', ' ', p).strip())

# Remove long words (> config.TOKEN_MAX_LEN chars)
def para2words(para):
    """Turn a paragraph into a list of word tokens with gensim.

    Delegates to gensim.utils.simple_preprocess with accent removal enabled
    (deacc=True) and with config.TOKEN_MAX_LEN as the maximum token length,
    so words longer than that are left out of the result.
    """
    words = gensim.utils.simple_preprocess(
        para,
        deacc=True,
        max_len=config.TOKEN_MAX_LEN,
    )
    return words
# Word-level version of every paragraph, used below to measure its length.
df['para_proc'] = df['para'].map(para2words)

# Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens):
# mark them as missing, then let dropna() discard the marked rows.
is_too_short = df['para_proc'].map(len) < config.PARA_MIN_LEN
df.loc[is_too_short, 'para_proc'] = np.nan

df = df.dropna()

### **Merge or Chunk Sections**

In [8]:
# Rebuild one row per index entry (all four index levels), keeping the
# original row order. Paragraphs are glued back together, each followed by
# '. ', and the bullets of the first paragraph of the group are kept.
df = (
    df.groupby(level=[0, 1, 2, 3], sort=False)
      .agg({'para': lambda paras: ''.join(p + '. ' for p in paras),
            'bullets': lambda b: next(iter(b))})
      .rename(columns={'para': 'text'})
)

In [9]:
# Tokenise every section with the model tokenizer and record the token
# count. The two columns are appended in this order (text_enc, then
# text_num_tokens); later cells build rows positionally from df.columns.
df = df.assign(
    text_enc=lambda d: d['text'].map(tokenizer.tokenize),
    text_num_tokens=lambda d: d['text_enc'].map(len),
)

#### Chunk function

In [10]:
def chunk_text(text, num_tok):
    """Split `text` into groups of sentences sized to fit the model input.

    The text is split on RE_SPLITTER; every piece longer than one character
    is stripped and given a trailing '.'. The sentences are then packed, in
    order, into chunks. A chunk is closed as soon as it reaches the target
    size `len_chunk` (num_tok spread evenly over the number of chunks), or
    just before a sentence that would bring it to config.MODEL_MAX_LEN
    tokens or more.

    Args:
        text: the text to chunk.
        num_tok: token count of `text` as computed by the caller; it is only
            used to derive the target chunk size.

    Returns:
        A list of chunks, each a non-empty list of sentence strings. A
        sentence that on its own has config.MODEL_MAX_LEN tokens or more
        cannot be split further and ends up alone in an oversized chunk.
    """
    text_sent =\
        [sent.strip()+'.' for sent in re.split(RE_SPLITTER, text) if len(sent) > 1]

    # calculate number of tokens per sentence
    num_tok_sent = [len(tokenizer.tokenize(sent)) for sent in text_sent]

    # calculate chunk dimension to fit into model
    # (at least one chunk, so that num_tok == 0 does not divide by zero)
    n = max(1, int(np.ceil(num_tok / config.MODEL_MAX_LEN)))
    len_chunk = int(num_tok / n)
    # get a more uniform splitting to avoid splits
    # which are too short at the end
    if len_chunk+50 > config.MODEL_MAX_LEN:
        len_chunk = int(num_tok / (n+1))

    len_curr = 0
    text_curr = []
    text_chunk = []
    for te, len_sent in zip(text_sent, num_tok_sent):

        if len_curr + len_sent < len_chunk:
            # still below the target size: keep filling the current chunk
            text_curr.append(te)
            len_curr += len_sent

        elif len_curr + len_sent >= config.MODEL_MAX_LEN:
            # the sentence does not fit: close the current chunk first.
            # The chunk may be empty here (very long first sentence, or one
            # arriving right after a chunk was closed); never emit it then.
            if text_curr:
                text_chunk.append(text_curr)

            text_curr = [te]
            len_curr = len_sent

        else: # >= len_chunk && < MODEL_MAX_LEN
            # target size reached without overflowing: close with this sentence
            text_curr.append(te)
            text_chunk.append(text_curr)

            text_curr = []
            len_curr = 0

    # flush whatever is still pending, so no sentence is silently lost
    if text_curr:
        text_chunk.append(text_curr)

    return text_chunk

#### Merge or Chunk

In [11]:
def chunk_or_merge(df_merge, num_row, num_tok, bid, cpt, i, index):
    """Emit, chunk or postpone the current group of consecutive rows of `df`.

    The group consists of the `num_row` rows of the module-level `df` whose
    labels are index[i-num_row:i]. What happens depends on `num_tok`:

    * CHUNK_MIN_LEN <= num_tok < config.MODEL_MAX_LEN: the rows are merged
      into a single row (every column is summed, which concatenates strings
      and token lists; 'bullets' is taken from the first row of the group).
    * num_tok >= config.MODEL_MAX_LEN: the concatenated text is split with
      chunk_text and one row per chunk is added, all under the same label.
    * num_tok < CHUNK_MIN_LEN: nothing is added.

    New rows are indexed by (bid, cpt, str(labels of the group), chunked?).

    Args:
        df_merge: DataFrame collected so far; it is not modified in place.
        num_row: number of rows in the current group.
        num_tok: token count attributed to the group by the caller.
        bid: book id, first index level of the new rows.
        cpt: chapter id, second index level of the new rows.
        i: position in `index` just past the last row of the group.
        index: list of the index labels of `df` as built by the caller.

    Returns:
        (merge, df_merge): `merge` is True when the group is too short and
        the caller should extend it with the next row, False when rows were
        added; `df_merge` is the resulting DataFrame.
    """
    print('num_tok MOC', num_tok)

    # labels of the rows of `df` that make up the current group
    id_merged = index[i-num_row:i]

    # if the text fits perfectly, we just add it to df_merge
    if num_tok >= CHUNK_MIN_LEN and num_tok < config.MODEL_MAX_LEN:
        print('summ', id_merged)

        data_merged = []
        for c in df.columns:
            if c == 'bullets':
                data_merged.append(df.loc[id_merged, c].iloc[0])
            else:
                data_merged.append(df.loc[id_merged, c].sum())

        new_index = pd.MultiIndex.from_arrays(
            [[bid], [cpt], [str(id_merged)], [False]])
        df_merge_add = pd.DataFrame(
            data = [data_merged],
            index = new_index,
            columns = df.columns)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        df_merge = pd.concat([df_merge, df_merge_add])
        return False, df_merge

    # if the text is too long, we chunk it
    elif num_tok >= config.MODEL_MAX_LEN:
        print('chunking', id_merged)

        new_index = pd.MultiIndex.from_arrays(
            [[bid], [cpt], [str(id_merged)], [True]])

        text_merged = df.loc[id_merged, 'text'].sum()
        text_chunked = chunk_text(text_merged, num_tok)
        # bullets are read positionally from the last row of the group
        bullets = df.iloc[i-1].bullets

        # one single-row frame per chunk; the values are listed in the order
        # text, bullets, tokens, token count, which has to match df.columns
        chunk_frames = []
        for tc in text_chunked:
            t = ' '.join(tc)
            t_enc = tokenizer.tokenize(t)   # tokenise once, reuse for the count
            chunk_frames.append(pd.DataFrame(
                data = [[t, bullets, t_enc, len(t_enc)]],
                index = new_index,
                columns = df.columns))
        # add all chunks with a single concat instead of one append per chunk
        df_merge = pd.concat([df_merge] + chunk_frames)
        return False, df_merge

    # if the text is too short, we merge it
    elif num_tok < CHUNK_MIN_LEN:
        print('skipping', id_merged)
        return True, df_merge

#### Create the new dataset

In [12]:
# define new dataset for the chapters:
# an empty frame with the same columns as df and a 4-level index.
# chunk_or_merge fills it with rows labelled
# (book, chapter, str(list of merged df labels), chunked True/False)
indices = pd.MultiIndex.from_arrays(
    [[], [], [], []],
    names = ['book', 'chapter', 'merge', 'chunk'])
df_moc = pd.DataFrame(index = indices, columns = df.columns.tolist())

# minimum length of a chunk, in tokens; chunk_or_merge reads this global and
# asks for a merge with the next section while a group is shorter than this
CHUNK_MIN_LEN = np.ceil(config.BULLETS_MED_LEN / config.MAX_RATIO)
print('Chunk minimum length:', CHUNK_MIN_LEN)
print()

# labels of df in row order; chunk_or_merge slices this list by position
index = df.index.tolist()

# position in df of the first row of the chapter being processed.
# NOTE(review): rows are addressed positionally (df.iloc / index[...]), so
# this assumes df's row order follows sorted book ids and, within a book,
# the chapter order returned by config.get_cpts - confirm.
i_cpt = 0
for bid in sorted(list(set(df.index.get_level_values(0)))):
    cpts = config.get_cpts(df, bid)
    for cpt in cpts:
        print('book', bid, 'chapter', cpt)
        print()

        # state of the group being built within this chapter:
        #   merge         - True while the current group is still too short
        #   num_row/tok   - rows / tokens in the current group
        #   *_past        - the same for the group that preceded it
        merge = False
        num_row, num_row_past = 0, 0
        num_tok, num_tok_past = 0, 0
        # one past the position of the chapter's last row
        max_i = i_cpt + len(df.loc[bid, cpt])

        # for each section in chapter
        for i in range(i_cpt, max_i):
            # if previous section is too short, 
            # we merge it with the one below
            if merge: 
                num_row += 1
                print('merging', num_tok, '+', df.iloc[i].text_num_tokens)
                num_tok += df.iloc[i].text_num_tokens
            else:
                # start a new group; remember the size of the one just closed
                num_row_past = num_row
                num_row = 1
                num_tok_past = num_tok
                num_tok = df.iloc[i].text_num_tokens
            '''
                after merging or considering a new section, we call this function
                to study the length. Then we either add the row to the dataset, chunk it 
                or do nothing if it is still too short
            '''
            merge, df_moc =\
                chunk_or_merge(df_moc, num_row, num_tok, bid, cpt, i+1, index)

            print()
            
            '''
                if the last chunk of the chapter is too small, we merge it
                with the above and repeat one last time the process
            '''
            if merge and (i+1 == max_i):
                print('EXCEPTION, last chunk too small.')
                if num_row_past == 0:
                    # no earlier group in this chapter to merge with: pass
                    # MODEL_MAX_LEN-1 as the token count so that chunk_or_merge
                    # takes its "fits" branch (as long as that value is
                    # >= CHUNK_MIN_LEN) and emits the short group as it is
                    _, df_moc =\
                        chunk_or_merge(df_moc, num_row, config.MODEL_MAX_LEN-1, bid, cpt, i+1, index)
                else:
                    # extend the group backwards over the previous one
                    num_row += num_row_past
                    num_tok += num_tok_past
                    # drop last row added and re-calculate for new, bigger chunk
                    # (drop() works by label, so if the previous group was
                    # chunked into several rows sharing this label, all of
                    # them are removed)
                    df_moc = df_moc.drop(df_moc.index[-1])
                    _, df_moc =\
                        chunk_or_merge(df_moc, num_row, num_tok, bid, cpt, i+1, index)
                 
        # advance to the first row of the next chapter
        i_cpt += len(df.loc[bid, cpt])
        print('\n############################')
        print()
        print()

# Sanity check: for every group in df_moc, the texts of its row(s), joined
# with spaces, must equal the concatenated texts of the df rows it was
# built from, ignoring leading/trailing whitespace.
# Series.iteritems() was removed in pandas 2.0; items() is the replacement.
for idx, r in df_moc.groupby(level=[0,1,2,3]).text.apply(list).items():
    # idx[2] is the str() of the list of df labels written by chunk_or_merge;
    # eval turns it back into that list (the string is produced by this notebook)
    idx_merged = eval(idx[2])
    old_txt = ''.join(df.loc[idx_merged].text.tolist()).strip()
    new_txt = ' '.join(r).strip()
    assert old_txt == new_txt, 'text changed while merging/chunking %s' % (idx,)

Chunk minimum length: 720.0

book 9781905832729 chapter ch_2

num_tok MOC 11
skipping [(9781905832729, 'ch_2', 'ch_2', 'ch_2')]

merging 11 + 239
num_tok MOC 250
skipping [(9781905832729, 'ch_2', 'ch_2', 'ch_2'), (9781905832729, 'ch_2', '1.i', '1.i')]

merging 250 + 692
num_tok MOC 942
summ [(9781905832729, 'ch_2', 'ch_2', 'ch_2'), (9781905832729, 'ch_2', '1.i', '1.i'), (9781905832729, 'ch_2', '1.ii', '1.ii')]


############################


book 9781905832729 chapter ch_3

num_tok MOC 93
skipping [(9781905832729, 'ch_3', '2.i', '2.i')]

merging 93 + 268
num_tok MOC 361
skipping [(9781905832729, 'ch_3', '2.i', '2.i'), (9781905832729, 'ch_3', '2.ii', '2.ii')]

merging 361 + 488
num_tok MOC 849
summ [(9781905832729, 'ch_3', '2.i', '2.i'), (9781905832729, 'ch_3', '2.ii', '2.ii'), (9781905832729, 'ch_3', '2.iii', '2.iii')]

num_tok MOC 424
skipping [(9781905832729, 'ch_3', '2.iv', '2.iv')]

merging 424 + 867
num_tok MOC 1291
chunking [(9781905832729, 'ch_3', '2.iv', '2.iv'), (978190583272





############################


book 9781905832729 chapter ch_4

num_tok MOC 8
skipping [(9781905832729, 'ch_4', 'ch_4', 'ch_4')]

merging 8 + 240
num_tok MOC 248
skipping [(9781905832729, 'ch_4', 'ch_4', 'ch_4'), (9781905832729, 'ch_4', '3.i', '3.i')]

merging 248 + 339
num_tok MOC 587
skipping [(9781905832729, 'ch_4', 'ch_4', 'ch_4'), (9781905832729, 'ch_4', '3.i', '3.i'), (9781905832729, 'ch_4', '3.ii', '3.ii')]

merging 587 + 287
num_tok MOC 874
summ [(9781905832729, 'ch_4', 'ch_4', 'ch_4'), (9781905832729, 'ch_4', '3.i', '3.i'), (9781905832729, 'ch_4', '3.ii', '3.ii'), (9781905832729, 'ch_4', '3.iii', '3.iii')]

num_tok MOC 62
skipping [(9781905832729, 'ch_4', '3.iv', '3.iv')]

merging 62 + 164
num_tok MOC 226
skipping [(9781905832729, 'ch_4', '3.iv', '3.iv'), (9781905832729, 'ch_4', '3.v', '3.v')]

merging 226 + 2193
num_tok MOC 2419
chunking [(9781905832729, 'ch_4', '3.iv', '3.iv'), (9781905832729, 'ch_4', '3.v', '3.v'), (9781905832729, 'ch_4', '3.vi', '3.vi')]

num_tok MOC 615


############################


book 9781908541024 chapter ch_10

num_tok MOC 194
skipping [(9781908541024, 'ch_10', 'ch_10', 'ch_10')]

merging 194 + 93
num_tok MOC 287
skipping [(9781908541024, 'ch_10', 'ch_10', 'ch_10'), (9781908541024, 'ch_10', '9.i', '9.i')]

merging 287 + 201
num_tok MOC 488
skipping [(9781908541024, 'ch_10', 'ch_10', 'ch_10'), (9781908541024, 'ch_10', '9.i', '9.i'), (9781908541024, 'ch_10', '9.ii', '9.ii')]

merging 488 + 493
num_tok MOC 981
summ [(9781908541024, 'ch_10', 'ch_10', 'ch_10'), (9781908541024, 'ch_10', '9.i', '9.i'), (9781908541024, 'ch_10', '9.ii', '9.ii'), (9781908541024, 'ch_10', '9.iii', '9.iii')]

num_tok MOC 3048
chunking [(9781908541024, 'ch_10', '9.iv', '9.iv')]

num_tok MOC 322
skipping [(9781908541024, 'ch_10', '9.v', '9.v')]

merging 322 + 190
num_tok MOC 512
skipping [(9781908541024, 'ch_10', '9.v', '9.v'), (9781908541024, 'ch_10', '9.vi', '9.vi')]

merging 512 + 178
num_tok MOC 690
skipping [(9781908541024, 'ch_10', '9.v', '9.v'), (9781



############################


book 9781908541062 chapter ch_7

num_tok MOC 51
skipping [(9781908541062, 'ch_7', 'ch_7', 'ch_7')]

merging 51 + 854
num_tok MOC 905
summ [(9781908541062, 'ch_7', 'ch_7', 'ch_7'), (9781908541062, 'ch_7', '6.i', '6.i')]

num_tok MOC 882
summ [(9781908541062, 'ch_7', '6.ii', '6.ii')]

num_tok MOC 226
skipping [(9781908541062, 'ch_7', '6.iii', '6.iii')]

merging 226 + 263
num_tok MOC 489
skipping [(9781908541062, 'ch_7', '6.iii', '6.iii'), (9781908541062, 'ch_7', '6.iv', '6.iv')]

EXCEPTION, last chunk too small.
num_tok MOC 1371
chunking [(9781908541062, 'ch_7', '6.ii', '6.ii'), (9781908541062, 'ch_7', '6.iii', '6.iii'), (9781908541062, 'ch_7', '6.iv', '6.iv')]

############################


book 9781908541062 chapter ch_8

num_tok MOC 157
skipping [(9781908541062, 'ch_8', 'ch_8', 'ch_8')]

merging 157 + 502
num_tok MOC 659
skipping [(9781908541062, 'ch_8', 'ch_8', 'ch_8'), (9781908541062, 'ch_8', '7.i', '7.i')]

merging 659 + 2082
num_tok MOC 2741
chunk


############################


book 9781908541086 chapter ch_9

num_tok MOC 99
skipping [(9781908541086, 'ch_9', 'ch_9', 'ch_9')]

merging 99 + 213
num_tok MOC 312
skipping [(9781908541086, 'ch_9', 'ch_9', 'ch_9'), (9781908541086, 'ch_9', '7.i', '7.i')]

merging 312 + 1106
num_tok MOC 1418
chunking [(9781908541086, 'ch_9', 'ch_9', 'ch_9'), (9781908541086, 'ch_9', '7.i', '7.i'), (9781908541086, 'ch_9', '7.ii', '7.ii')]

num_tok MOC 1372
chunking [(9781908541086, 'ch_9', '7.iii', '7.iii')]

num_tok MOC 287
skipping [(9781908541086, 'ch_9', '7.iv', '7.iv')]

merging 287 + 109
num_tok MOC 396
skipping [(9781908541086, 'ch_9', '7.iv', '7.iv'), (9781908541086, 'ch_9', '7.v', '7.v')]

EXCEPTION, last chunk too small.
num_tok MOC 1768
chunking [(9781908541086, 'ch_9', '7.iii', '7.iii'), (9781908541086, 'ch_9', '7.iv', '7.iv'), (9781908541086, 'ch_9', '7.v', '7.v')]

############################


book 9781908541086 chapter ch_10

num_tok MOC 7
skipping [(9781908541086, 'ch_10', 'ch_10', 'ch_1


############################


book 9781908541277 chapter ch_9

num_tok MOC 7
skipping [(9781908541277, 'ch_9', 'ch_9', 'ch_9')]

merging 7 + 1050
num_tok MOC 1057
chunking [(9781908541277, 'ch_9', 'ch_9', 'ch_9'), (9781908541277, 'ch_9', 'sec0', 'sec0')]

num_tok MOC 743
summ [(9781908541277, 'ch_9', 'sec1', 'sec1')]

num_tok MOC 295
skipping [(9781908541277, 'ch_9', 'sec2', 'sec2')]

merging 295 + 575
num_tok MOC 870
summ [(9781908541277, 'ch_9', 'sec2', 'sec2'), (9781908541277, 'ch_9', 'sec3', 'sec3')]

num_tok MOC 332
skipping [(9781908541277, 'ch_9', 'sec4', 'sec4')]

EXCEPTION, last chunk too small.
num_tok MOC 1202
chunking [(9781908541277, 'ch_9', 'sec2', 'sec2'), (9781908541277, 'ch_9', 'sec3', 'sec3'), (9781908541277, 'ch_9', 'sec4', 'sec4')]

############################


book 9781908541277 chapter ch_10

num_tok MOC 351
skipping [(9781908541277, 'ch_10', 'ch_10', 'ch_10')]

merging 351 + 128
num_tok MOC 479
skipping [(9781908541277, 'ch_10', 'ch_10', 'ch_10'), (9781908541


############################


book 9781908541406 chapter ch_11

num_tok MOC 6
skipping [(9781908541406, 'ch_11', 'ch_11', 'ch_11')]

merging 6 + 2683
num_tok MOC 2689
chunking [(9781908541406, 'ch_11', 'ch_11', 'ch_11'), (9781908541406, 'ch_11', 'sec0', 'sec0')]

num_tok MOC 1287
chunking [(9781908541406, 'ch_11', 'sec1', 'sec1')]


############################


book 9781908541406 chapter ch_12

num_tok MOC 294
skipping [(9781908541406, 'ch_12', 'ch_12', 'ch_12')]

merging 294 + 184
num_tok MOC 478
skipping [(9781908541406, 'ch_12', 'ch_12', 'ch_12'), (9781908541406, 'ch_12', 'sec0', 'sec0')]

merging 478 + 574
num_tok MOC 1052
chunking [(9781908541406, 'ch_12', 'ch_12', 'ch_12'), (9781908541406, 'ch_12', 'sec0', 'sec0'), (9781908541406, 'ch_12', 'sec1', 'sec1')]

num_tok MOC 334
skipping [(9781908541406, 'ch_12', 'sec2', 'sec2')]

merging 334 + 323
num_tok MOC 657
skipping [(9781908541406, 'ch_12', 'sec2', 'sec2'), (9781908541406, 'ch_12', 'sec3', 'sec3')]

merging 657 + 624
num_to


############################


book 9781908541420 chapter ch_9

num_tok MOC 49
skipping [(9781908541420, 'ch_9', 'ch_9', 'ch_9')]

merging 49 + 90
num_tok MOC 139
skipping [(9781908541420, 'ch_9', 'ch_9', 'ch_9'), (9781908541420, 'ch_9', 'sec0', 'sec0')]

merging 139 + 932
num_tok MOC 1071
chunking [(9781908541420, 'ch_9', 'ch_9', 'ch_9'), (9781908541420, 'ch_9', 'sec0', 'sec0'), (9781908541420, 'ch_9', 'sec1', 'sec1')]

num_tok MOC 245
skipping [(9781908541420, 'ch_9', 'sec2', 'sec2')]

merging 245 + 300
num_tok MOC 545
skipping [(9781908541420, 'ch_9', 'sec2', 'sec2'), (9781908541420, 'ch_9', 'sec3', 'sec3')]

merging 545 + 416
num_tok MOC 961
summ [(9781908541420, 'ch_9', 'sec2', 'sec2'), (9781908541420, 'ch_9', 'sec3', 'sec3'), (9781908541420, 'ch_9', 'sec4', 'sec4')]

num_tok MOC 453
skipping [(9781908541420, 'ch_9', 'sec5', 'sec5')]

merging 453 + 129
num_tok MOC 582
skipping [(9781908541420, 'ch_9', 'sec5', 'sec5'), (9781908541420, 'ch_9', 'sec6', 'sec6')]

EXCEPTION, last chun


num_tok MOC 147
skipping [(9781908541468, 'ch_11', 'sec2', 'sec2')]

merging 147 + 2401
num_tok MOC 2548
chunking [(9781908541468, 'ch_11', 'sec2', 'sec2'), (9781908541468, 'ch_11', 'sec3', 'sec3')]


############################


book 9781908541468 chapter ch_12

num_tok MOC 370
skipping [(9781908541468, 'ch_12', 'ch_12', 'ch_12')]

merging 370 + 109
num_tok MOC 479
skipping [(9781908541468, 'ch_12', 'ch_12', 'ch_12'), (9781908541468, 'ch_12', 'sec0', 'sec0')]

merging 479 + 434
num_tok MOC 913
summ [(9781908541468, 'ch_12', 'ch_12', 'ch_12'), (9781908541468, 'ch_12', 'sec0', 'sec0'), (9781908541468, 'ch_12', 'sec1', 'sec1')]

num_tok MOC 241
skipping [(9781908541468, 'ch_12', 'sec2', 'sec2')]

merging 241 + 252
num_tok MOC 493
skipping [(9781908541468, 'ch_12', 'sec2', 'sec2'), (9781908541468, 'ch_12', 'sec3', 'sec3')]

merging 493 + 297
num_tok MOC 790
summ [(9781908541468, 'ch_12', 'sec2', 'sec2'), (9781908541468, 'ch_12', 'sec3', 'sec3'), (9781908541468, 'ch_12', 'sec4', 'sec4')


num_tok MOC 305
skipping [(9781908541666, 'ch_7', 'sec10', 'sec10')]

EXCEPTION, last chunk too small.
num_tok MOC 1446
chunking [(9781908541666, 'ch_7', 'sec8', 'sec8'), (9781908541666, 'ch_7', 'sec9', 'sec9'), (9781908541666, 'ch_7', 'sec10', 'sec10')]

############################


book 9781908541666 chapter ch_8

num_tok MOC 228
skipping [(9781908541666, 'ch_8', 'ch_8', 'ch_8')]

merging 228 + 291
num_tok MOC 519
skipping [(9781908541666, 'ch_8', 'ch_8', 'ch_8'), (9781908541666, 'ch_8', 'sec0', 'sec0')]

merging 519 + 287
num_tok MOC 806
summ [(9781908541666, 'ch_8', 'ch_8', 'ch_8'), (9781908541666, 'ch_8', 'sec0', 'sec0'), (9781908541666, 'ch_8', 'sec1', 'sec1')]

num_tok MOC 33
skipping [(9781908541666, 'ch_8', 'sec2', 'sec2')]

merging 33 + 377
num_tok MOC 410
skipping [(9781908541666, 'ch_8', 'sec2', 'sec2'), (9781908541666, 'ch_8', 'sec3', 'sec3')]

merging 410 + 1532
num_tok MOC 1942
chunking [(9781908541666, 'ch_8', 'sec2', 'sec2'), (9781908541666, 'ch_8', 'sec3', 'sec3'),


num_tok MOC 2758
chunking [(9781908541680, 'ch_4', 'sec1', 'sec1')]

num_tok MOC 480
skipping [(9781908541680, 'ch_4', 'sec2', 'sec2')]

merging 480 + 883
num_tok MOC 1363
chunking [(9781908541680, 'ch_4', 'sec2', 'sec2'), (9781908541680, 'ch_4', 'sec3', 'sec3')]

num_tok MOC 237
skipping [(9781908541680, 'ch_4', 'sec4', 'sec4')]

merging 237 + 158
num_tok MOC 395
skipping [(9781908541680, 'ch_4', 'sec4', 'sec4'), (9781908541680, 'ch_4', 'sec5', 'sec5')]

merging 395 + 1115
num_tok MOC 1510
chunking [(9781908541680, 'ch_4', 'sec4', 'sec4'), (9781908541680, 'ch_4', 'sec5', 'sec5'), (9781908541680, 'ch_4', 'sec6', 'sec6')]

num_tok MOC 117
skipping [(9781908541680, 'ch_4', 'sec7', 'sec7')]

EXCEPTION, last chunk too small.
num_tok MOC 1627
chunking [(9781908541680, 'ch_4', 'sec4', 'sec4'), (9781908541680, 'ch_4', 'sec5', 'sec5'), (9781908541680, 'ch_4', 'sec6', 'sec6'), (9781908541680, 'ch_4', 'sec7', 'sec7')]

############################


book 9781908541680 chapter ch_5

num_tok MOC 

skipping [(9781908541703, 'ch_3', 'ch_3', 'ch_3'), (9781908541703, 'ch_3', 'sec0', 'sec0')]

merging 644 + 1357
num_tok MOC 2001
chunking [(9781908541703, 'ch_3', 'ch_3', 'ch_3'), (9781908541703, 'ch_3', 'sec0', 'sec0'), (9781908541703, 'ch_3', 'sec1', 'sec1')]

num_tok MOC 197
skipping [(9781908541703, 'ch_3', 'sec2', 'sec2')]

merging 197 + 657
num_tok MOC 854
summ [(9781908541703, 'ch_3', 'sec2', 'sec2'), (9781908541703, 'ch_3', 'sec3', 'sec3')]


############################


book 9781908541703 chapter ch_4

num_tok MOC 742
summ [(9781908541703, 'ch_4', 'sec0', 'sec0')]

num_tok MOC 1508
chunking [(9781908541703, 'ch_4', 'sec1', 'sec1')]

num_tok MOC 311
skipping [(9781908541703, 'ch_4', 'sec2', 'sec2')]

merging 311 + 580
num_tok MOC 891
summ [(9781908541703, 'ch_4', 'sec2', 'sec2'), (9781908541703, 'ch_4', 'sec3', 'sec3')]

num_tok MOC 747
summ [(9781908541703, 'ch_4', 'sec4', 'sec4')]

num_tok MOC 650
skipping [(9781908541703, 'ch_4', 'sec5', 'sec5')]

merging 650 + 772
num_tok


num_tok MOC 1578
chunking [(9781908541727, 'ch03', 'sect11', 'sect11')]

num_tok MOC 623
skipping [(9781908541727, 'ch03', 'sect12', 'sect12')]

EXCEPTION, last chunk too small.
num_tok MOC 2201
chunking [(9781908541727, 'ch03', 'sect11', 'sect11'), (9781908541727, 'ch03', 'sect12', 'sect12')]

############################


book 9781908541727 chapter ch04

num_tok MOC 83
skipping [(9781908541727, 'ch04', 'ch04', 'ch04')]

merging 83 + 1502
num_tok MOC 1585
chunking [(9781908541727, 'ch04', 'ch04', 'ch04'), (9781908541727, 'ch04', 'sect13', 'sect13')]


############################


book 9781908541727 chapter ch05

num_tok MOC 44
skipping [(9781908541727, 'ch05', 'ch05', 'ch05')]

merging 44 + 1074
num_tok MOC 1118
chunking [(9781908541727, 'ch05', 'ch05', 'ch05'), (9781908541727, 'ch05', 'sect14', 'sect14')]


############################


book 9781908541727 chapter ch06

num_tok MOC 36
skipping [(9781908541727, 'ch06', 'ch06', 'ch06')]

merging 36 + 54
num_tok MOC 90
skipping [(97


num_tok MOC 583
skipping [(9781908541796, 'chapter3', 'sect18', 'sect18')]

merging 583 + 306
num_tok MOC 889
summ [(9781908541796, 'chapter3', 'sect18', 'sect18'), (9781908541796, 'chapter3', 'sect19', 'sect19')]

num_tok MOC 960
summ [(9781908541796, 'chapter3', 'sect20', 'sect20')]


############################


book 9781908541796 chapter chapter4

num_tok MOC 140
skipping [(9781908541796, 'chapter4', 'chapter4', 'chapter4')]

merging 140 + 225
num_tok MOC 365
skipping [(9781908541796, 'chapter4', 'chapter4', 'chapter4'), (9781908541796, 'chapter4', 'sect22', 'sect22')]

merging 365 + 56
num_tok MOC 421
skipping [(9781908541796, 'chapter4', 'chapter4', 'chapter4'), (9781908541796, 'chapter4', 'sect22', 'sect22'), (9781908541796, 'chapter4', 'sect23', 'sect23')]

merging 421 + 903
num_tok MOC 1324
chunking [(9781908541796, 'chapter4', 'chapter4', 'chapter4'), (9781908541796, 'chapter4', 'sect22', 'sect22'), (9781908541796, 'chapter4', 'sect23', 'sect23'), (9781908541796, 'chapter4


num_tok MOC 459
skipping [(9781908541901, 'ch_5', 'sec1', 'sec1')]

EXCEPTION, last chunk too small.
num_tok MOC 1458
chunking [(9781908541901, 'ch_5', 'ch_5', 'ch_5'), (9781908541901, 'ch_5', 'sec0', 'sec0'), (9781908541901, 'ch_5', 'sec1', 'sec1')]

############################


book 9781908541901 chapter ch_6

num_tok MOC 234
skipping [(9781908541901, 'ch_6', 'ch_6', 'ch_6')]

merging 234 + 220
num_tok MOC 454
skipping [(9781908541901, 'ch_6', 'ch_6', 'ch_6'), (9781908541901, 'ch_6', 'sec0', 'sec0')]

merging 454 + 66
num_tok MOC 520
skipping [(9781908541901, 'ch_6', 'ch_6', 'ch_6'), (9781908541901, 'ch_6', 'sec0', 'sec0'), (9781908541901, 'ch_6', 'sec1', 'sec1')]

merging 520 + 548
num_tok MOC 1068
chunking [(9781908541901, 'ch_6', 'ch_6', 'ch_6'), (9781908541901, 'ch_6', 'sec0', 'sec0'), (9781908541901, 'ch_6', 'sec1', 'sec1'), (9781908541901, 'ch_6', 'sec2', 'sec2')]

num_tok MOC 362
skipping [(9781908541901, 'ch_6', 'sec3', 'sec3')]

merging 362 + 179
num_tok MOC 541
skipping 


############################


book 9781908541963 chapter chapter7

num_tok MOC 117
skipping [(9781908541963, 'chapter7', 'chapter7', 'chapter7')]

merging 117 + 1194
num_tok MOC 1311
chunking [(9781908541963, 'chapter7', 'chapter7', 'chapter7'), (9781908541963, 'chapter7', 'sect29', 'sect29')]

num_tok MOC 1167
chunking [(9781908541963, 'chapter7', 'sect30', 'sect30')]


############################


book 9781908541963 chapter chapter8

num_tok MOC 86
skipping [(9781908541963, 'chapter8', 'chapter8', 'chapter8')]

merging 86 + 1041
num_tok MOC 1127
chunking [(9781908541963, 'chapter8', 'chapter8', 'chapter8'), (9781908541963, 'chapter8', 'sect32', 'sect32')]

num_tok MOC 151
skipping [(9781908541963, 'chapter8', 'sect33', 'sect33')]

merging 151 + 842
num_tok MOC 993
summ [(9781908541963, 'chapter8', 'sect33', 'sect33'), (9781908541963, 'chapter8', 'sect34', 'sect34')]

num_tok MOC 730
summ [(9781908541963, 'chapter8', 'sect35', 'sect35')]


############################


book 97819


num_tok MOC 545
skipping [(9781908541994, 'ch07', 'sect35', 'sect35')]

EXCEPTION, last chunk too small.
num_tok MOC 1356
chunking [(9781908541994, 'ch07', 'ch07', 'ch07'), (9781908541994, 'ch07', 'sect33', 'sect33'), (9781908541994, 'ch07', 'sect34', 'sect34'), (9781908541994, 'ch07', 'sect35', 'sect35')]

############################


book 9781908541994 chapter ch08

num_tok MOC 6
skipping [(9781908541994, 'ch08', 'ch08', 'ch08')]

merging 6 + 185
num_tok MOC 191
skipping [(9781908541994, 'ch08', 'ch08', 'ch08'), (9781908541994, 'ch08', 'sect37', 'sect37')]

merging 191 + 585
num_tok MOC 776
summ [(9781908541994, 'ch08', 'ch08', 'ch08'), (9781908541994, 'ch08', 'sect37', 'sect37'), (9781908541994, 'ch08', 'sect38', 'sect38')]

num_tok MOC 243
skipping [(9781908541994, 'ch08', 'sect39', 'sect39')]

merging 243 + 1015
num_tok MOC 1258
chunking [(9781908541994, 'ch08', 'sect39', 'sect39'), (9781908541994, 'ch08', 'sect40', 'sect40')]

num_tok MOC 809
summ [(9781908541994, 'ch08', 'sec


num_tok MOC 236
skipping [(9781910797006, 'ch08', 'sect53', 'sect53')]

merging 236 + 2039
num_tok MOC 2275
chunking [(9781910797006, 'ch08', 'sect53', 'sect53'), (9781910797006, 'ch08', 'sect54', 'sect54')]

num_tok MOC 227
skipping [(9781910797006, 'ch08', 'sect55', 'sect55')]

merging 227 + 209
num_tok MOC 436
skipping [(9781910797006, 'ch08', 'sect55', 'sect55'), (9781910797006, 'ch08', 'sect56', 'sect56')]

EXCEPTION, last chunk too small.
num_tok MOC 2711
chunking [(9781910797006, 'ch08', 'sect53', 'sect53'), (9781910797006, 'ch08', 'sect54', 'sect54'), (9781910797006, 'ch08', 'sect55', 'sect55'), (9781910797006, 'ch08', 'sect56', 'sect56')]

############################


book 9781910797082 chapter ch01

num_tok MOC 10
skipping [(9781910797082, 'ch01', 'ch01', 'ch01')]

merging 10 + 682
num_tok MOC 692
skipping [(9781910797082, 'ch01', 'ch01', 'ch01'), (9781910797082, 'ch01', 'sect1', 'sect1')]

merging 692 + 1869
num_tok MOC 2561
chunking [(9781910797082, 'ch01', 'ch01', 'ch01


############################


book 9781910797105 chapter ch01

num_tok MOC 4
skipping [(9781910797105, 'ch01', 'ch01', 'ch01')]

merging 4 + 188
num_tok MOC 192
skipping [(9781910797105, 'ch01', 'ch01', 'ch01'), (9781910797105, 'ch01', 'sect1', 'sect1')]

merging 192 + 128
num_tok MOC 320
skipping [(9781910797105, 'ch01', 'ch01', 'ch01'), (9781910797105, 'ch01', 'sect1', 'sect1'), (9781910797105, 'ch01', 'sect2', 'sect2')]

merging 320 + 739
num_tok MOC 1059
chunking [(9781910797105, 'ch01', 'ch01', 'ch01'), (9781910797105, 'ch01', 'sect1', 'sect1'), (9781910797105, 'ch01', 'sect2', 'sect2'), (9781910797105, 'ch01', 'sect3', 'sect3')]

num_tok MOC 110
skipping [(9781910797105, 'ch01', 'sect4', 'sect4')]

EXCEPTION, last chunk too small.
num_tok MOC 1169
chunking [(9781910797105, 'ch01', 'ch01', 'ch01'), (9781910797105, 'ch01', 'sect1', 'sect1'), (9781910797105, 'ch01', 'sect2', 'sect2'), (9781910797105, 'ch01', 'sect3', 'sect3'), (9781910797105, 'ch01', 'sect4', 'sect4')]

##########


############################


book 9781910797105 chapter ch10

num_tok MOC 159
skipping [(9781910797105, 'ch10', 'ch10', 'ch10')]

merging 159 + 824
num_tok MOC 983
summ [(9781910797105, 'ch10', 'ch10', 'ch10'), (9781910797105, 'ch10', 'sect55', 'sect55')]

num_tok MOC 816
summ [(9781910797105, 'ch10', 'sect56', 'sect56')]

num_tok MOC 449
skipping [(9781910797105, 'ch10', 'sect57', 'sect57')]

merging 449 + 444
num_tok MOC 893
summ [(9781910797105, 'ch10', 'sect57', 'sect57'), (9781910797105, 'ch10', 'sect58', 'sect58')]

num_tok MOC 325
skipping [(9781910797105, 'ch10', 'sect59', 'sect59')]

merging 325 + 268
num_tok MOC 593
skipping [(9781910797105, 'ch10', 'sect59', 'sect59'), (9781910797105, 'ch10', 'sect60', 'sect60')]

EXCEPTION, last chunk too small.
num_tok MOC 1486
chunking [(9781910797105, 'ch10', 'sect57', 'sect57'), (9781910797105, 'ch10', 'sect58', 'sect58'), (9781910797105, 'ch10', 'sect59', 'sect59'), (9781910797105, 'ch10', 'sect60', 'sect60')]

#####################



############################


book 9781910797181 chapter ch03

num_tok MOC 9
skipping [(9781910797181, 'ch03', 'ch03', 'ch03')]

merging 9 + 101
num_tok MOC 110
skipping [(9781910797181, 'ch03', 'ch03', 'ch03'), (9781910797181, 'ch03', 'sect11', 'sect11')]

merging 110 + 240
num_tok MOC 350
skipping [(9781910797181, 'ch03', 'ch03', 'ch03'), (9781910797181, 'ch03', 'sect11', 'sect11'), (9781910797181, 'ch03', 'sect12', 'sect12')]

merging 350 + 131
num_tok MOC 481
skipping [(9781910797181, 'ch03', 'ch03', 'ch03'), (9781910797181, 'ch03', 'sect11', 'sect11'), (9781910797181, 'ch03', 'sect12', 'sect12'), (9781910797181, 'ch03', 'sect13', 'sect13')]

merging 481 + 67
num_tok MOC 548
skipping [(9781910797181, 'ch03', 'ch03', 'ch03'), (9781910797181, 'ch03', 'sect11', 'sect11'), (9781910797181, 'ch03', 'sect12', 'sect12'), (9781910797181, 'ch03', 'sect13', 'sect13'), (9781910797181, 'ch03', 'sect14', 'sect14')]

merging 548 + 701
num_tok MOC 1249
chunking [(9781910797181, 'ch03', 'ch03', 


num_tok MOC 173
skipping [(9781910797211, 'ch03', 'sect20', 'sect20')]

merging 173 + 420
num_tok MOC 593
skipping [(9781910797211, 'ch03', 'sect20', 'sect20'), (9781910797211, 'ch03', 'sect21', 'sect21')]

merging 593 + 1569
num_tok MOC 2162
chunking [(9781910797211, 'ch03', 'sect20', 'sect20'), (9781910797211, 'ch03', 'sect21', 'sect21'), (9781910797211, 'ch03', 'sect21', 'sect22_sub_sect21')]

num_tok MOC 312
skipping [(9781910797211, 'ch03', 'sect23', 'sect23')]

EXCEPTION, last chunk too small.
num_tok MOC 2474
chunking [(9781910797211, 'ch03', 'sect20', 'sect20'), (9781910797211, 'ch03', 'sect21', 'sect21'), (9781910797211, 'ch03', 'sect21', 'sect22_sub_sect21'), (9781910797211, 'ch03', 'sect23', 'sect23')]

############################


book 9781910797211 chapter ch04

num_tok MOC 3
skipping [(9781910797211, 'ch04', 'ch04', 'ch04')]

merging 3 + 270
num_tok MOC 273
skipping [(9781910797211, 'ch04', 'ch04', 'ch04'), (9781910797211, 'ch04', 'sect25', 'sect25')]

merging 273 + 13


num_tok MOC 361
skipping [(9781910797273, 'chp2', 'sect8', 'sect8')]

merging 361 + 376
num_tok MOC 737
summ [(9781910797273, 'chp2', 'sect8', 'sect8'), (9781910797273, 'chp2', 'sect9', 'sect9')]


############################


book 9781910797273 chapter chp3

num_tok MOC 4
skipping [(9781910797273, 'chp3', 'chp3', 'chp3')]

merging 4 + 379
num_tok MOC 383
skipping [(9781910797273, 'chp3', 'chp3', 'chp3'), (9781910797273, 'chp3', 'sect11', 'sect11')]

merging 383 + 328
num_tok MOC 711
skipping [(9781910797273, 'chp3', 'chp3', 'chp3'), (9781910797273, 'chp3', 'sect11', 'sect11'), (9781910797273, 'chp3', 'sect12', 'sect12')]

merging 711 + 704
num_tok MOC 1415
chunking [(9781910797273, 'chp3', 'chp3', 'chp3'), (9781910797273, 'chp3', 'sect11', 'sect11'), (9781910797273, 'chp3', 'sect12', 'sect12'), (9781910797273, 'chp3', 'sect13', 'sect13')]

num_tok MOC 426
skipping [(9781910797273, 'chp3', 'sect14', 'sect14')]

merging 426 + 2042
num_tok MOC 2468
chunking [(9781910797273, 'chp3', 's


num_tok MOC 251
skipping [(9781910797297, 'chp4', 'sec8', 'sec8')]

merging 251 + 529
num_tok MOC 780
summ [(9781910797297, 'chp4', 'sec8', 'sec8'), (9781910797297, 'chp4', 'sec9', 'sec9')]

num_tok MOC 495
skipping [(9781910797297, 'chp4', 'sect20', 'sect20')]

merging 495 + 494
num_tok MOC 989
summ [(9781910797297, 'chp4', 'sect20', 'sect20'), (9781910797297, 'chp4', 'sect21', 'sect21')]

num_tok MOC 349
skipping [(9781910797297, 'chp4', 'sect22', 'sect22')]

merging 349 + 316
num_tok MOC 665
skipping [(9781910797297, 'chp4', 'sect22', 'sect22'), (9781910797297, 'chp4', 'sect23', 'sect23')]

merging 665 + 369
num_tok MOC 1034
chunking [(9781910797297, 'chp4', 'sect22', 'sect22'), (9781910797297, 'chp4', 'sect23', 'sect23'), (9781910797297, 'chp4', 'sect24', 'sect24')]

num_tok MOC 330
skipping [(9781910797297, 'chp4', 'sect25', 'sect25')]

EXCEPTION, last chunk too small.
num_tok MOC 1364
chunking [(9781910797297, 'chp4', 'sect22', 'sect22'), (9781910797297, 'chp4', 'sect23', 'sect2


num_tok MOC 1052
chunking [(9781910797426, 'ch03', 'sec8', 'sec8')]

num_tok MOC 652
skipping [(9781910797426, 'ch03', 'sec9', 'sec9')]

merging 652 + 739
num_tok MOC 1391
chunking [(9781910797426, 'ch03', 'sec9', 'sec9'), (9781910797426, 'ch03', 'sec10', 'sec10')]

num_tok MOC 873
summ [(9781910797426, 'ch03', 'sec11', 'sec11')]

num_tok MOC 1046
chunking [(9781910797426, 'ch03', 'sec12', 'sec12')]

num_tok MOC 141
skipping [(9781910797426, 'ch03', 'sec13', 'sec13')]

EXCEPTION, last chunk too small.
num_tok MOC 1187
chunking [(9781910797426, 'ch03', 'sec12', 'sec12'), (9781910797426, 'ch03', 'sec13', 'sec13')]

############################


book 9781910797426 chapter ch04

num_tok MOC 16
skipping [(9781910797426, 'ch04', 'ch04', 'ch04')]

merging 16 + 224
num_tok MOC 240
skipping [(9781910797426, 'ch04', 'ch04', 'ch04'), (9781910797426, 'ch04', 'sec14', 'sec14')]

merging 240 + 60
num_tok MOC 300
skipping [(9781910797426, 'ch04', 'ch04', 'ch04'), (9781910797426, 'ch04', 'sec14', 's


############################


book 9781910797426 chapter ch12

num_tok MOC 64
skipping [(9781910797426, 'ch12', 'ch12', 'ch12')]

merging 64 + 1076
num_tok MOC 1140
chunking [(9781910797426, 'ch12', 'ch12', 'ch12'), (9781910797426, 'ch12', 'sec64', 'sec64')]

num_tok MOC 2604
chunking [(9781910797426, 'ch12', 'sec65', 'sec65')]

num_tok MOC 736
summ [(9781910797426, 'ch12', 'sec66', 'sec66')]

num_tok MOC 642
skipping [(9781910797426, 'ch12', 'sec67', 'sec67')]

merging 642 + 83
num_tok MOC 725
summ [(9781910797426, 'ch12', 'sec67', 'sec67'), (9781910797426, 'ch12', 'sec68', 'sec68')]


############################


book 9781910797433 chapter ch01

num_tok MOC 158
skipping [(9781910797433, 'ch01', 'ch01', 'ch01')]

merging 158 + 195
num_tok MOC 353
skipping [(9781910797433, 'ch01', 'ch01', 'ch01'), (9781910797433, 'ch01', 'sect1', 'sect1')]

merging 353 + 141
num_tok MOC 494
skipping [(9781910797433, 'ch01', 'ch01', 'ch01'), (9781910797433, 'ch01', 'sect1', 'sect1'), (9781910797433,


num_tok MOC 1136
chunking [(9781910797457, 'chp4', 'sect14', 'sect14')]

num_tok MOC 660
skipping [(9781910797457, 'chp4', 'sect15', 'sect15')]

merging 660 + 25
num_tok MOC 685
skipping [(9781910797457, 'chp4', 'sect15', 'sect15'), (9781910797457, 'chp4', 'sect16', 'sect16')]

EXCEPTION, last chunk too small.
num_tok MOC 1821
chunking [(9781910797457, 'chp4', 'sect14', 'sect14'), (9781910797457, 'chp4', 'sect15', 'sect15'), (9781910797457, 'chp4', 'sect16', 'sect16')]

############################


book 9781910797457 chapter chp5

num_tok MOC 374
skipping [(9781910797457, 'chp5', 'chp5', 'chp5')]

merging 374 + 364
num_tok MOC 738
summ [(9781910797457, 'chp5', 'chp5', 'chp5'), (9781910797457, 'chp5', 'sect18', 'sect18')]

num_tok MOC 1368
chunking [(9781910797457, 'chp5', 'sect19', 'sect19')]

num_tok MOC 1338
chunking [(9781910797457, 'chp5', 'sect20', 'sect20')]

num_tok MOC 309
skipping [(9781910797457, 'chp5', 'sect21', 'sect21')]

merging 309 + 616
num_tok MOC 925
summ [(978191


num_tok MOC 1196
chunking [(9781910797471, 'ch06', 'sect28', 'sect28')]

num_tok MOC 1222
chunking [(9781910797471, 'ch06', 'sect29', 'sect29')]

num_tok MOC 1980
chunking [(9781910797471, 'ch06', 'sect30', 'sect30')]


############################


book 9781910797471 chapter ch07

num_tok MOC 337
skipping [(9781910797471, 'ch07', 'ch07', 'ch07')]

merging 337 + 328
num_tok MOC 665
skipping [(9781910797471, 'ch07', 'ch07', 'ch07'), (9781910797471, 'ch07', 'sect32', 'sect32')]

merging 665 + 3999
num_tok MOC 4664
chunking [(9781910797471, 'ch07', 'ch07', 'ch07'), (9781910797471, 'ch07', 'sect32', 'sect32'), (9781910797471, 'ch07', 'sect33', 'sect33')]

num_tok MOC 1503
chunking [(9781910797471, 'ch07', 'sect34', 'sect34')]

num_tok MOC 314
skipping [(9781910797471, 'ch07', 'sect35', 'sect35')]

merging 314 + 298
num_tok MOC 612
skipping [(9781910797471, 'ch07', 'sect35', 'sect35'), (9781910797471, 'ch07', 'sect36', 'sect36')]

merging 612 + 261
num_tok MOC 873
summ [(9781910797471, 'c


num_tok MOC 2007
chunking [(9781910797495, 'chp6', 'sect45', 'sect45')]


############################


book 9781910797495 chapter chp7

num_tok MOC 161
skipping [(9781910797495, 'chp7', 'chp7', 'chp7')]

merging 161 + 647
num_tok MOC 808
summ [(9781910797495, 'chp7', 'chp7', 'chp7'), (9781910797495, 'chp7', 'sect48', 'sect48')]

num_tok MOC 668
skipping [(9781910797495, 'chp7', 'sect49', 'sect49')]

merging 668 + 130
num_tok MOC 798
summ [(9781910797495, 'chp7', 'sect49', 'sect49'), (9781910797495, 'chp7', 'sect50', 'sect50')]

num_tok MOC 94
skipping [(9781910797495, 'chp7', 'sect51', 'sect51')]

merging 94 + 156
num_tok MOC 250
skipping [(9781910797495, 'chp7', 'sect51', 'sect51'), (9781910797495, 'chp7', 'sect52', 'sect52')]

merging 250 + 285
num_tok MOC 535
skipping [(9781910797495, 'chp7', 'sect51', 'sect51'), (9781910797495, 'chp7', 'sect52', 'sect52'), (9781910797495, 'chp7', 'sect53', 'sect53')]

merging 535 + 605
num_tok MOC 1140
chunking [(9781910797495, 'chp7', 'sect51',


############################


book 9781910797556 chapter chp2

num_tok MOC 8
skipping [(9781910797556, 'chp2', 'chp2', 'chp2')]

merging 8 + 119
num_tok MOC 127
skipping [(9781910797556, 'chp2', 'chp2', 'chp2'), (9781910797556, 'chp2', 'sect7', 'sect7')]

merging 127 + 987
num_tok MOC 1114
chunking [(9781910797556, 'chp2', 'chp2', 'chp2'), (9781910797556, 'chp2', 'sect7', 'sect7'), (9781910797556, 'chp2', 'sect8', 'sect8')]

num_tok MOC 857
summ [(9781910797556, 'chp2', 'sect9', 'sect9')]


############################


book 9781910797556 chapter chp3

num_tok MOC 11
skipping [(9781910797556, 'chp3', 'chp3', 'chp3')]

merging 11 + 430
num_tok MOC 441
skipping [(9781910797556, 'chp3', 'chp3', 'chp3'), (9781910797556, 'chp3', 'sect11', 'sect11')]

merging 441 + 516
num_tok MOC 957
summ [(9781910797556, 'chp3', 'chp3', 'chp3'), (9781910797556, 'chp3', 'sect11', 'sect11'), (9781910797556, 'chp3', 'sect12', 'sect12')]

num_tok MOC 768
summ [(9781910797556, 'chp3', 'sect13', 'sect13')]

n


############################


book 9781910797587 chapter chp09

num_tok MOC 7
skipping [(9781910797587, 'chp09', 'chp09', 'chp09')]

merging 7 + 1309
num_tok MOC 1316
chunking [(9781910797587, 'chp09', 'chp09', 'chp09'), (9781910797587, 'chp09', 'sect51', 'sect51')]

num_tok MOC 497
skipping [(9781910797587, 'chp09', 'sect52', 'sect52')]

merging 497 + 157
num_tok MOC 654
skipping [(9781910797587, 'chp09', 'sect52', 'sect52'), (9781910797587, 'chp09', 'sect53', 'sect53')]

EXCEPTION, last chunk too small.
num_tok MOC 1970
chunking [(9781910797587, 'chp09', 'chp09', 'chp09'), (9781910797587, 'chp09', 'sect51', 'sect51'), (9781910797587, 'chp09', 'sect52', 'sect52'), (9781910797587, 'chp09', 'sect53', 'sect53')]

############################


book 9781910797587 chapter chp10

num_tok MOC 112
skipping [(9781910797587, 'chp10', 'chp10', 'chp10')]

merging 112 + 226
num_tok MOC 338
skipping [(9781910797587, 'chp10', 'chp10', 'chp10'), (9781910797587, 'chp10', 'sect55', 'sect55')]

mergin



############################


book 9781910797631 chapter chp4

num_tok MOC 5
skipping [(9781910797631, 'chp4', 'chp4', 'chp4')]

merging 5 + 692
num_tok MOC 697
skipping [(9781910797631, 'chp4', 'chp4', 'chp4'), (9781910797631, 'chp4', 'sect19', 'sect19')]

merging 697 + 1249
num_tok MOC 1946
chunking [(9781910797631, 'chp4', 'chp4', 'chp4'), (9781910797631, 'chp4', 'sect19', 'sect19'), (9781910797631, 'chp4', 'sect20', 'sect20')]

num_tok MOC 956
summ [(9781910797631, 'chp4', 'sect21', 'sect21')]

num_tok MOC 165
skipping [(9781910797631, 'chp4', 'sect22', 'sect22')]

merging 165 + 1708
num_tok MOC 1873
chunking [(9781910797631, 'chp4', 'sect22', 'sect22'), (9781910797631, 'chp4', 'sect23', 'sect23')]


############################


book 9781910797631 chapter chp5

num_tok MOC 34
skipping [(9781910797631, 'chp5', 'chp5', 'chp5')]

merging 34 + 112
num_tok MOC 146
skipping [(9781910797631, 'chp5', 'chp5', 'chp5'), (9781910797631, 'chp5', 'sect25', 'sect25')]

merging 146 + 204
num_



############################


book 9781910797662 chapter ch05

num_tok MOC 10
skipping [(9781910797662, 'ch05', 'ch05', 'ch05')]

merging 10 + 122
num_tok MOC 132
skipping [(9781910797662, 'ch05', 'ch05', 'ch05'), (9781910797662, 'ch05', 'sect20', 'sect20')]

merging 132 + 287
num_tok MOC 419
skipping [(9781910797662, 'ch05', 'ch05', 'ch05'), (9781910797662, 'ch05', 'sect20', 'sect20'), (9781910797662, 'ch05', 'sect21', 'sect21')]

merging 419 + 122
num_tok MOC 541
skipping [(9781910797662, 'ch05', 'ch05', 'ch05'), (9781910797662, 'ch05', 'sect20', 'sect20'), (9781910797662, 'ch05', 'sect21', 'sect21'), (9781910797662, 'ch05', 'sect22', 'sect22')]

merging 541 + 891
num_tok MOC 1432
chunking [(9781910797662, 'ch05', 'ch05', 'ch05'), (9781910797662, 'ch05', 'sect20', 'sect20'), (9781910797662, 'ch05', 'sect21', 'sect21'), (9781910797662, 'ch05', 'sect22', 'sect22'), (9781910797662, 'ch05', 'sect23', 'sect23')]

num_tok MOC 1016
summ [(9781910797662, 'ch05', 'sect24', 'sect24')]

num_


############################


book 9781910797693 chapter chp7

num_tok MOC 116
skipping [(9781910797693, 'chp7', 'sect24', 'sect24')]

merging 116 + 927
num_tok MOC 1043
chunking [(9781910797693, 'chp7', 'sect24', 'sect24'), (9781910797693, 'chp7', 'sect25', 'sect25')]

num_tok MOC 414
skipping [(9781910797693, 'chp7', 'sect26', 'sect26')]

merging 414 + 120
num_tok MOC 534
skipping [(9781910797693, 'chp7', 'sect26', 'sect26'), (9781910797693, 'chp7', 'sect27', 'sect27')]

merging 534 + 275
num_tok MOC 809
summ [(9781910797693, 'chp7', 'sect26', 'sect26'), (9781910797693, 'chp7', 'sect27', 'sect27'), (9781910797693, 'chp7', 'sect28', 'sect28')]

num_tok MOC 51
skipping [(9781910797693, 'chp7', 'sect29', 'sect29')]

EXCEPTION, last chunk too small.
num_tok MOC 860
summ [(9781910797693, 'chp7', 'sect26', 'sect26'), (9781910797693, 'chp7', 'sect27', 'sect27'), (9781910797693, 'chp7', 'sect28', 'sect28'), (9781910797693, 'chp7', 'sect29', 'sect29')]

############################


book 9


num_tok MOC 461
skipping [(9781910797815, 'chp4', 'sec18', 'sec18')]

merging 461 + 490
num_tok MOC 951
summ [(9781910797815, 'chp4', 'sec18', 'sec18'), (9781910797815, 'chp4', 'sec19', 'sec19')]

num_tok MOC 206
skipping [(9781910797815, 'chp4', 'sec20', 'sec20')]

merging 206 + 270
num_tok MOC 476
skipping [(9781910797815, 'chp4', 'sec20', 'sec20'), (9781910797815, 'chp4', 'sec21', 'sec21')]

EXCEPTION, last chunk too small.
num_tok MOC 1427
chunking [(9781910797815, 'chp4', 'sec18', 'sec18'), (9781910797815, 'chp4', 'sec19', 'sec19'), (9781910797815, 'chp4', 'sec20', 'sec20'), (9781910797815, 'chp4', 'sec21', 'sec21')]

############################


book 9781910797815 chapter chp5

num_tok MOC 10
skipping [(9781910797815, 'chp5', 'chp5', 'chp5')]

merging 10 + 246
num_tok MOC 256
skipping [(9781910797815, 'chp5', 'chp5', 'chp5'), (9781910797815, 'chp5', 'sec22', 'sec22')]

merging 256 + 237
num_tok MOC 493
skipping [(9781910797815, 'chp5', 'chp5', 'chp5'), (9781910797815, 'chp5', 


num_tok MOC 205
skipping [(9781910797853, 'chp6', 'sec26', 'sec26')]

merging 205 + 175
num_tok MOC 380
skipping [(9781910797853, 'chp6', 'sec26', 'sec26'), (9781910797853, 'chp6', 'sec27', 'sec27')]

merging 380 + 142
num_tok MOC 522
skipping [(9781910797853, 'chp6', 'sec26', 'sec26'), (9781910797853, 'chp6', 'sec27', 'sec27'), (9781910797853, 'chp6', 'sec28', 'sec28')]

merging 522 + 243
num_tok MOC 765
summ [(9781910797853, 'chp6', 'sec26', 'sec26'), (9781910797853, 'chp6', 'sec27', 'sec27'), (9781910797853, 'chp6', 'sec28', 'sec28'), (9781910797853, 'chp6', 'sec29', 'sec29')]


############################


book 9781910797853 chapter chp7

num_tok MOC 93
skipping [(9781910797853, 'chp7', 'chp7', 'chp7')]

merging 93 + 335
num_tok MOC 428
skipping [(9781910797853, 'chp7', 'chp7', 'chp7'), (9781910797853, 'chp7', 'sec30', 'sec30')]

merging 428 + 512
num_tok MOC 940
summ [(9781910797853, 'chp7', 'chp7', 'chp7'), (9781910797853, 'chp7', 'sec30', 'sec30'), (9781910797853, 'chp7', 'se


############################


book 9781912776139 chapter ch5

num_tok MOC 185
skipping [(9781912776139, 'ch5', 'ch5', 'ch5')]

merging 185 + 190
num_tok MOC 375
skipping [(9781912776139, 'ch5', 'ch5', 'ch5'), (9781912776139, 'ch5', 'sec28', 'sec28')]

merging 375 + 1855
num_tok MOC 2230
chunking [(9781912776139, 'ch5', 'ch5', 'ch5'), (9781912776139, 'ch5', 'sec28', 'sec28'), (9781912776139, 'ch5', 'sec29', 'sec29')]

num_tok MOC 396
skipping [(9781912776139, 'ch5', 'sec30', 'sec30')]

merging 396 + 107
num_tok MOC 503
skipping [(9781912776139, 'ch5', 'sec30', 'sec30'), (9781912776139, 'ch5', 'sec31', 'sec31')]

merging 503 + 360
num_tok MOC 863
summ [(9781912776139, 'ch5', 'sec30', 'sec30'), (9781912776139, 'ch5', 'sec31', 'sec31'), (9781912776139, 'ch5', 'sec32', 'sec32')]

num_tok MOC 108
skipping [(9781912776139, 'ch5', 'sec33', 'sec33')]

merging 108 + 764
num_tok MOC 872
summ [(9781912776139, 'ch5', 'sec33', 'sec33'), (9781912776139, 'ch5', 'sec34', 'sec34')]


#################


############################


book 9781912776153 chapter chp7

num_tok MOC 9
skipping [(9781912776153, 'chp7', 'chp7', 'chp7')]

merging 9 + 293
num_tok MOC 302
skipping [(9781912776153, 'chp7', 'chp7', 'chp7'), (9781912776153, 'chp7', 'sec49', 'sec49')]

merging 302 + 141
num_tok MOC 443
skipping [(9781912776153, 'chp7', 'chp7', 'chp7'), (9781912776153, 'chp7', 'sec49', 'sec49'), (9781912776153, 'chp7', 'sec50', 'sec50')]

merging 443 + 482
num_tok MOC 925
summ [(9781912776153, 'chp7', 'chp7', 'chp7'), (9781912776153, 'chp7', 'sec49', 'sec49'), (9781912776153, 'chp7', 'sec50', 'sec50'), (9781912776153, 'chp7', 'sec51', 'sec51')]

num_tok MOC 1133
chunking [(9781912776153, 'chp7', 'sec52', 'sec52')]

num_tok MOC 275
skipping [(9781912776153, 'chp7', 'sec52', 'sec52-1_sub_sec52')]

merging 275 + 203
num_tok MOC 478
skipping [(9781912776153, 'chp7', 'sec52', 'sec52-1_sub_sec52'), (9781912776153, 'chp7', 'sec52', 'sec52-2_sub_sec52')]

merging 478 + 63
num_tok MOC 541
skipping [(9781912


num_tok MOC 100
skipping [(9781912776207, 'ch1', 'sec2', 'sec2')]

EXCEPTION, last chunk too small.
num_tok MOC 1174
chunking [(9781912776207, 'ch1', 'ch1', 'ch1'), (9781912776207, 'ch1', 'sec1', 'sec1'), (9781912776207, 'ch1', 'sec2', 'sec2')]

############################


book 9781912776207 chapter ch2

num_tok MOC 386
skipping [(9781912776207, 'ch2', 'ch2', 'ch2')]

merging 386 + 92
num_tok MOC 478
skipping [(9781912776207, 'ch2', 'ch2', 'ch2'), (9781912776207, 'ch2', 'sec3', 'sec3')]

merging 478 + 763
num_tok MOC 1241
chunking [(9781912776207, 'ch2', 'ch2', 'ch2'), (9781912776207, 'ch2', 'sec3', 'sec3'), (9781912776207, 'ch2', 'sec4', 'sec4')]

num_tok MOC 1025
chunking [(9781912776207, 'ch2', 'sec5', 'sec5')]

num_tok MOC 618
skipping [(9781912776207, 'ch2', 'sec6', 'sec6')]

merging 618 + 263
num_tok MOC 881
summ [(9781912776207, 'ch2', 'sec6', 'sec6'), (9781912776207, 'ch2', 'sec7', 'sec7')]

num_tok MOC 531
skipping [(9781912776207, 'ch2', 'sec8', 'sec8')]

merging 531 + 67


num_tok MOC 187
skipping [(9781912776238, 'ch4', 'sec12', 'sec12')]

EXCEPTION, last chunk too small.
num_tok MOC 1064
chunking [(9781912776238, 'ch4', 'ch4', 'ch4'), (9781912776238, 'ch4', 'sec10', 'sec10'), (9781912776238, 'ch4', 'sec11', 'sec11'), (9781912776238, 'ch4', 'sec12', 'sec12')]

############################


book 9781912776238 chapter ch5

num_tok MOC 55
skipping [(9781912776238, 'ch5', 'ch5', 'ch5')]

merging 55 + 894
num_tok MOC 949
summ [(9781912776238, 'ch5', 'ch5', 'ch5'), (9781912776238, 'ch5', 'sec13', 'sec13')]

num_tok MOC 609
skipping [(9781912776238, 'ch5', 'sec14', 'sec14')]

EXCEPTION, last chunk too small.
num_tok MOC 1558
chunking [(9781912776238, 'ch5', 'ch5', 'ch5'), (9781912776238, 'ch5', 'sec13', 'sec13'), (9781912776238, 'ch5', 'sec14', 'sec14')]

############################


book 9781912776238 chapter ch6

num_tok MOC 127
skipping [(9781912776238, 'ch6', 'ch6', 'ch6')]

merging 127 + 395
num_tok MOC 522
skipping [(9781912776238, 'ch6', 'ch6', 'ch6


num_tok MOC 1261
chunking [(9781912776276, 'chp4', 'sec17', 'sec17')]

num_tok MOC 525
skipping [(9781912776276, 'chp4', 'sec18', 'sec18')]

EXCEPTION, last chunk too small.
num_tok MOC 1786
chunking [(9781912776276, 'chp4', 'sec17', 'sec17'), (9781912776276, 'chp4', 'sec18', 'sec18')]

############################


book 9781912776276 chapter chp5

num_tok MOC 439
skipping [(9781912776276, 'chp5', 'sec19', 'sec19')]

merging 439 + 436
num_tok MOC 875
summ [(9781912776276, 'chp5', 'sec19', 'sec19'), (9781912776276, 'chp5', 'sec20', 'sec20')]

num_tok MOC 3999
chunking [(9781912776276, 'chp5', 'sec21', 'sec21')]

num_tok MOC 231
skipping [(9781912776276, 'chp5', 'sec22', 'sec22')]

merging 231 + 277
num_tok MOC 508
skipping [(9781912776276, 'chp5', 'sec22', 'sec22'), (9781912776276, 'chp5', 'sec23', 'sec23')]

merging 508 + 133
num_tok MOC 641
skipping [(9781912776276, 'chp5', 'sec22', 'sec22'), (9781912776276, 'chp5', 'sec23', 'sec23'), (9781912776276, 'chp5', 'sec24', 'sec24')]

merg



############################


book 9781912776726 chapter ch2

num_tok MOC 154
skipping [(9781912776726, 'ch2', 'ch2', 'ch2')]

merging 154 + 421
num_tok MOC 575
skipping [(9781912776726, 'ch2', 'ch2', 'ch2'), (9781912776726, 'ch2', 'sec5', 'sec5')]

merging 575 + 572
num_tok MOC 1147
chunking [(9781912776726, 'ch2', 'ch2', 'ch2'), (9781912776726, 'ch2', 'sec5', 'sec5'), (9781912776726, 'ch2', 'sec6', 'sec6')]


############################


book 9781912776726 chapter ch3

num_tok MOC 463
skipping [(9781912776726, 'ch3', 'sec7', 'sec7')]

merging 463 + 533
num_tok MOC 996
summ [(9781912776726, 'ch3', 'sec7', 'sec7'), (9781912776726, 'ch3', 'sec8', 'sec8')]

num_tok MOC 3583
chunking [(9781912776726, 'ch3', 'sec9', 'sec9')]

num_tok MOC 252
skipping [(9781912776726, 'ch3', 'sec10', 'sec10')]

EXCEPTION, last chunk too small.
num_tok MOC 3835
chunking [(9781912776726, 'ch3', 'sec9', 'sec9'), (9781912776726, 'ch3', 'sec10', 'sec10')]

############################


book 9781912776726 c


############################


book 9783318066241 chapter ch4

num_tok MOC 190
skipping [(9783318066241, 'ch4', 'ch4', 'ch4')]

merging 190 + 81
num_tok MOC 271
skipping [(9783318066241, 'ch4', 'ch4', 'ch4'), (9783318066241, 'ch4', 'sec11', 'sec11')]

merging 271 + 854
num_tok MOC 1125
chunking [(9783318066241, 'ch4', 'ch4', 'ch4'), (9783318066241, 'ch4', 'sec11', 'sec11'), (9783318066241, 'ch4', 'sec12', 'sec12')]

num_tok MOC 439
skipping [(9783318066241, 'ch4', 'sec13', 'sec13')]

EXCEPTION, last chunk too small.
num_tok MOC 1564
chunking [(9783318066241, 'ch4', 'ch4', 'ch4'), (9783318066241, 'ch4', 'sec11', 'sec11'), (9783318066241, 'ch4', 'sec12', 'sec12'), (9783318066241, 'ch4', 'sec13', 'sec13')]

############################


book 9783318066241 chapter ch5

num_tok MOC 150
skipping [(9783318066241, 'ch5', 'ch5', 'ch5')]

merging 150 + 799
num_tok MOC 949
summ [(9783318066241, 'ch5', 'ch5', 'ch5'), (9783318066241, 'ch5', 'sec14', 'sec14')]

num_tok MOC 401
skipping [(978331806


num_tok MOC 229
skipping [(9783318066685, 'ch3', 'sec39', 'sec43_sub_sec39')]

merging 229 + 208
num_tok MOC 437
skipping [(9783318066685, 'ch3', 'sec39', 'sec43_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec44_sub_sec39')]

merging 437 + 183
num_tok MOC 620
skipping [(9783318066685, 'ch3', 'sec39', 'sec43_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec44_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec45_sub_sec39')]

merging 620 + 234
num_tok MOC 854
summ [(9783318066685, 'ch3', 'sec39', 'sec43_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec44_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec45_sub_sec39'), (9783318066685, 'ch3', 'sec39', 'sec46_sub_sec39')]

num_tok MOC 328
skipping [(9783318066685, 'ch3', 'sec47', 'sec47')]

merging 328 + 270
num_tok MOC 598
skipping [(9783318066685, 'ch3', 'sec47', 'sec47'), (9783318066685, 'ch3', 'sec48', 'sec48')]

merging 598 + 284
num_tok MOC 882
summ [(9783318066685, 'ch3', 'sec47', 'sec47'), (9783318066685, 'ch3', 'sec48', 'sec48'),


############################


book 9783318067095 chapter ch8

num_tok MOC 485
skipping [(9783318067095, 'ch8', 'ch8', 'ch8')]

merging 485 + 745
num_tok MOC 1230
chunking [(9783318067095, 'ch8', 'ch8', 'ch8'), (9783318067095, 'ch8', 'sec25', 'sec25')]

num_tok MOC 281
skipping [(9783318067095, 'ch8', 'sec26', 'sec26')]

EXCEPTION, last chunk too small.
num_tok MOC 1511
chunking [(9783318067095, 'ch8', 'ch8', 'ch8'), (9783318067095, 'ch8', 'sec25', 'sec25'), (9783318067095, 'ch8', 'sec26', 'sec26')]

############################


book 9783318067095 chapter ch9

num_tok MOC 56
skipping [(9783318067095, 'ch9', 'ch9', 'ch9')]

merging 56 + 1468
num_tok MOC 1524
chunking [(9783318067095, 'ch9', 'ch9', 'ch9'), (9783318067095, 'ch9', 'sec27', 'sec27')]

num_tok MOC 152
skipping [(9783318067095, 'ch9', 'sec28', 'sec28')]

merging 152 + 291
num_tok MOC 443
skipping [(9783318067095, 'ch9', 'sec28', 'sec28'), (9783318067095, 'ch9', 'sec29', 'sec29')]

EXCEPTION, last chunk too small.
num_tok M

In [13]:
# Collapse df_moc to one row per (book, chapter).
# Index levels 2 and 3 are first moved out of the index (presumably the
# section / subsection levels -- verify against the cell that builds df_moc);
# they are not listed in the aggregation, so they are dropped from the result.
# sort=False keeps the groups in their order of first appearance.
df_moc = (
    df_moc
    .reset_index(level=[2, 3])
    .groupby(['book', 'chapter'], sort=False)
    .agg({
        # one list entry per original row of the chapter
        'merge': list,
        'chunk': list,
        'text': list,
        # only the first row's bullets are kept -- presumably they are the
        # same for every row of a chapter; TODO confirm
        'bullets': lambda b: b.iloc[0],
    })
)

#### Save new dataset

In [14]:
# Persist the per-chapter dataset next to the other outputs of this notebook.
df_moc.to_csv(f'{OUTPUT_PATH}df.csv')

#### Create train, test, validation

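Generate files (the cells below also write a `.csv` dump and a `.index` file, listing the index label of every line, for each split):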
* train.source
* train.target
* val.source
* val.target
* test.source
* test.target

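Since we do not have a lot of samples, validation and test will be of equal size for us: the shuffled chapters are cut so that about 80% of the chunks go to train and about 10% each to validation and test.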

In [15]:
# Flatten each chapter's bullets into one space-separated string.
# NOTE: not idempotent -- running this cell twice would join the characters
# of the already-joined string.
df_moc['bullets'] = df_moc['bullets'].map(' '.join)

In [16]:
# Shuffle the rows with a fixed seed.
# NOTE: df_moc is reassigned, so re-running this cell shuffles the already
# shuffled frame again.
df_moc = df_moc.sample(frac=1, random_state=config.SEED)

# Running total of the number of text chunks, in shuffled row order.
df_moc['num_chunks'] = df_moc['text'].map(len).cumsum()
tot_chunk = df_moc['num_chunks'].iloc[-1]

# For each fraction, find the first row whose running total exceeds that share
# of all chunks; the +1 puts that row into the earlier split.
split1, split2 = (
    np.flatnonzero(df_moc['num_chunks'] > int(tot_chunk * share))[0] + 1
    for share in (0.8, 0.9)
)
print(split1, split2)

365 409


In [17]:
# Cut the shuffled frame at the two split positions, then explode 'text' so
# every chunk gets its own row (the other columns are repeated per chunk).
train = df_moc.iloc[:split1].explode('text')
val = df_moc.iloc[split1:split2].explode('text')
test = df_moc.iloc[split2:].explode('text')

# Dump each split as a CSV in the output directory.
for split_df, file_name in ((train, 'train.csv'),
                            (val, 'val.csv'),
                            (test, 'test.csv')):
    split_df.to_csv(OUTPUT_PATH + file_name)

In [18]:
# Write the train split as three line-aligned files, one line per chunk:
#   train.source -> the chunk text
#   train.target -> the bullets string
#   train.index  -> str() of the row's index label
# The encoding is pinned to UTF-8 so the files do not depend on the machine's
# locale (non-ASCII characters could otherwise raise UnicodeEncodeError) and
# match the UTF-8 default that to_csv uses for the .csv dumps.
with open(OUTPUT_PATH+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(OUTPUT_PATH+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(OUTPUT_PATH+'train.index', 'w', encoding='utf-8') as tr_i:
    # Iterate the columns directly; avoids building a Series per row.
    for idx, text, bullets in zip(train.index, train['text'], train['bullets']):
        tr_i.write(str(idx) + '\n')
        tr_s.write(text + '\n')
        tr_t.write(bullets + '\n')

In [19]:
# Write the validation split as three line-aligned files, one line per chunk:
#   val.source -> the chunk text
#   val.target -> the bullets string
#   val.index  -> str() of the row's index label
# The encoding is pinned to UTF-8 so the files do not depend on the machine's
# locale (non-ASCII characters could otherwise raise UnicodeEncodeError) and
# match the UTF-8 default that to_csv uses for the .csv dumps.
with open(OUTPUT_PATH+'val.source', 'w', encoding='utf-8') as va_s,\
    open(OUTPUT_PATH+'val.target', 'w', encoding='utf-8') as va_t,\
    open(OUTPUT_PATH+'val.index', 'w', encoding='utf-8') as va_i:
    # Iterate the columns directly; avoids building a Series per row.
    for idx, text, bullets in zip(val.index, val['text'], val['bullets']):
        va_i.write(str(idx) + '\n')
        va_s.write(text + '\n')
        va_t.write(bullets + '\n')

In [20]:
# Write the test split as three line-aligned files, one line per chunk:
#   test.source -> the chunk text
#   test.target -> the bullets string
#   test.index  -> str() of the row's index label
# The encoding is pinned to UTF-8 so the files do not depend on the machine's
# locale (non-ASCII characters could otherwise raise UnicodeEncodeError) and
# match the UTF-8 default that to_csv uses for the .csv dumps.
with open(OUTPUT_PATH+'test.source', 'w', encoding='utf-8') as te_s,\
    open(OUTPUT_PATH+'test.target', 'w', encoding='utf-8') as te_t,\
    open(OUTPUT_PATH+'test.index', 'w', encoding='utf-8') as te_i:
    # Iterate the columns directly; avoids building a Series per row.
    for idx, text, bullets in zip(test.index, test['text'], test['bullets']):
        te_i.write(str(idx) + '\n')
        te_s.write(text + '\n')
        te_t.write(bullets + '\n')