#### For Colab

In [1]:
"""
function ClickConnect(){
    console.log("Working");
    document.querySelector("colab-toolbar-button").click() 
}
var i = setInterval(ClickConnect, 900000)
clearInterval(i)
"""

'\nfunction ClickConnect(){\n    console.log("Working");\n    document.querySelector("colab-toolbar-button").click() \n}\nvar i = setInterval(ClickConnect, 900000)\nclearInterval(i)\n'

In [2]:
# Mount Google Drive at /content/drive (Colab only); the paths below live there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project root on the mounted Drive. It is put on sys.path to import `config`
# and is the prefix of every dataset path built in this notebook.
drive_dir = '/content/drive/My Drive/MAGMA: Summarization/'

#### Install Libraries

In [4]:
# Only transformers is pinned to an exact version. sentencepiece excludes
# release 0.1.92; the packages installed with -U are upgraded to whatever is
# latest at install time, so results may vary between runs of this cell.
!pip install transformers==4.1.1
!pip install -U sentencepiece!=0.1.92
!pip install -U datasets
!pip install rouge_score
!pip install -U gensim

Collecting transformers==4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 11.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.0MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 49.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=313c4529ef5

### **Config**

In [5]:
import os
import sys

sys.path.insert(0, drive_dir)
import config

In [6]:
# General configurations

# Model family; matched as a substring when the tokenizer is selected
# ('pegasus' or 'bart') and used as the last component of OUTPUT_PATH.
MODEL = 'bart'

# Regex used to split text into sentences / paragraphs.
# The current value splits on a newline or on a dot that is NOT followed by
# a digit, so decimals such as "1.5" are not broken apart.
# Use r'\.|\n' to split on every dot, or r'\n' to split on paragraphs only.
# Raw string: '\.' and '\d' are invalid escapes in a normal string literal
# (DeprecationWarning, SyntaxWarning on recent Python). The compiled regex
# matches exactly the same text as before.
RE_SPLITTER = r'\.(?!\d)|\n'

TOKEN_MAX_LEN = 99              # max length of a word
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

# Output path (created if missing; exist_ok avoids the check-then-create race)
OUTPUT_PATH = drive_dir+'datasets/karger_books_moc/'+MODEL+'/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

### **Init**

In [7]:
import ast
import re

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Select the tokenizer from MODEL. The match is by substring, so any MODEL
# value containing 'pegasus' or 'bart' picks the corresponding tokenizer.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
else:
    # Fail here rather than with a NameError on `tokenizer` many cells later.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing 'pegasus' or 'bart'"
        % (MODEL,))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




### **Karger Books Base Dataset**

In [None]:
# Load the base dataset and index it by (book, chapter, section, subsection).
base_dataset = drive_dir+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
# `bullets` cells are expected to hold the repr of a Python literal
# (presumably a list of strings — it is joined with ' ' further down).
# ast.literal_eval parses literals only, so a crafted CSV cell cannot run
# arbitrary code the way eval() would. Missing values are left as NaN.
df.bullets = df.bullets.map(ast.literal_eval, na_action='ignore')

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [None]:
# Split in sentences / paragraphs based on RE_SPLITTER.
# Empty pieces are filtered before stripping, so whitespace-only pieces
# survive here as '' and are removed by the minimum-length filter below.
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
# drops every row with a missing value in ANY column (text or bullets)
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces (raw string: '\s' is an invalid escape in a
# normal string literal)
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Tokenize into words, discarding words longer than TOKEN_MAX_LEN chars.
# NOTE: `para_proc` is only used to measure length for the filter below;
# `para` itself is not modified by this step.
def para2words(para):
    """Return the word list gensim extracts from `para` (accents removed,
    words longer than TOKEN_MAX_LEN discarded)."""
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=TOKEN_MAX_LEN)
df['para_proc'] = df.para.map(para2words)

# Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens):
# mark them as NaN, then drop them.
df.loc[df.para_proc.map(len) <\
    PARA_MIN_LENGTH, 'para_proc'] = np.nan

df = df.dropna()

### **Merge or Chunk Sections**

In [None]:
# Collapse the exploded rows back to one row per
# (book, chapter, section, subsection): every surviving paragraph is
# re-joined followed by '. ', and the group's first `bullets` value is kept.
df = (
    df.groupby(level=[0, 1, 2, 3], sort=False)
      .agg({
          'para': lambda paras: ''.join(piece + '. ' for piece in paras),
          'bullets': lambda values: list(values)[0],
      })
      .rename(columns={'para': 'text'})
)

In [None]:
# Tokenize every section with the model tokenizer, then store how many
# tokens each section has.
df = df.assign(text_enc=df['text'].map(tokenizer.tokenize))
df = df.assign(text_num_tokens=df['text_enc'].map(len))

#### Chunk function

In [None]:
def chunk_text(text, num_tok, max_len=None, tokenize=None, splitter=None):
    """Split `text` into chunks of sentences that fit the model input size.

    The text is split into sentences with `splitter`. Sentences are then
    packed greedily into chunks of roughly equal token count, where the
    target size is derived from `num_tok` and `max_len`.

    Args:
        text: the text to chunk.
        num_tok: total number of tokens of `text`, as computed by the caller.
        max_len: maximum number of tokens per chunk. Defaults to
            `config.MODEL_MAX_LEN`.
        tokenize: callable mapping a string to a sequence of tokens.
            Defaults to `tokenizer.tokenize`.
        splitter: regex used to split `text` into sentences. Defaults to
            `RE_SPLITTER`.

    Returns:
        A list of chunks, each chunk being a non-empty list of sentences
        (stripped, with a trailing '.').

    Note:
        A single sentence at or above `max_len` tokens cannot be split
        further and is emitted as (part of) an over-long chunk.
    """
    # Defaults are resolved at call time so the notebook-level settings
    # are picked up even if they are redefined after this cell has run.
    if max_len is None:
        max_len = config.MODEL_MAX_LEN
    if tokenize is None:
        tokenize = tokenizer.tokenize
    if splitter is None:
        splitter = RE_SPLITTER

    text_sent =\
        [sent.strip()+'.' for sent in re.split(splitter, text) if len(sent) > 1]

    # calculate number of tokens per sentence
    num_tok_sent = [len(tokenize(sent)) for sent in text_sent]

    # calculate chunk dimension to fit into model
    # (at least one chunk, so num_tok == 0 does not divide by zero)
    n = max(1, int(np.ceil(num_tok / max_len)))
    len_chunk = int(num_tok / n)
    # get a more uniform splitting to avoid splits
    # which are too short at the end
    if len_chunk+50 > max_len:
        len_chunk = int(num_tok / (n+1))

    len_curr = 0
    text_curr = []
    text_chunk = []
    for te, len_sent in zip(text_sent, num_tok_sent):

        if len_curr + len_sent < len_chunk:
            # still below the target size: keep filling the current chunk
            text_curr.append(te)
            len_curr += len_sent

        elif len_curr + len_sent >= max_len:
            # adding this sentence would overflow the model: close the
            # current chunk and start a new one with this sentence.
            # Guard: if the current chunk is empty (the sentence alone is
            # already too long) there is nothing to close; without the
            # guard an empty chunk would be emitted.
            if text_curr:
                text_chunk.append(text_curr)

            text_curr = [te]
            len_curr = len_sent

        else: # >= len_chunk && < max_len
            # target size reached without overflowing: close the chunk
            text_curr.append(te)
            text_chunk.append(text_curr)

            text_curr = []
            len_curr = 0

    # flush whatever is left
    if text_curr:
        text_chunk.append(text_curr)

    return text_chunk

#### Merge or Chunk

In [None]:
def chunk_or_merge(df_merge, num_row, num_tok, bid, cpt, i, index):
    """Add the current window of sections to `df_merge`, chunk it, or skip it.

    The window is the `num_row` rows of the global `df` that end just before
    position `i` (i.e. `index[i-num_row:i]`).

    Args:
        df_merge: dataset built so far; new rows are concatenated to it.
        num_row: number of consecutive `df` rows in the window.
        num_tok: total number of tokens of the window.
        bid: book id, first level of the new rows' index.
        cpt: chapter id, second level of the new rows' index.
        i: position (exclusive) in `index` where the window ends.
        index: list of the index labels of `df`, in row order.

    Returns:
        A tuple `(merge, df_merge)`. `merge` is True when the window is
        shorter than CHUNK_MIN_LEN and must be extended with the next
        section (nothing is added); otherwise it is False and `df_merge`
        contains the new row(s).

    Raises:
        ValueError: if `num_tok` cannot be compared with the thresholds
            (e.g. NaN). Previously this case returned None implicitly.
    """
    print('num_tok MOC', num_tok)

    # labels of the df rows that make up the current window
    id_merged = index[i-num_row:i]

    # if the text fits perfectly, we just add it to df_merge
    if num_tok >= CHUNK_MIN_LEN and num_tok < config.MODEL_MAX_LEN:
        print('summ', id_merged)

        # `bullets` takes the first row's value; every other column is
        # summed over the window (strings / lists are concatenated,
        # token counts are added)
        data_merged = []
        for c in df.columns:
            if c == 'bullets':
                data_merged.append(df.loc[id_merged, c].iloc[0])
            else:
                data_merged.append(df.loc[id_merged, c].sum())

        new_index = pd.MultiIndex.from_arrays(
            [[bid], [cpt], [str(id_merged)], [False]])
        df_merge_add = pd.DataFrame(
            data = [data_merged],
            index = new_index,
            columns = df.columns)
        # pd.concat instead of DataFrame.append (removed in pandas 2.0)
        df_merge = pd.concat([df_merge, df_merge_add], sort=False)
        return False, df_merge

    # if the text is too long, we chunk it
    elif num_tok >= config.MODEL_MAX_LEN:
        print('chunking', id_merged)

        # all the chunks of one window share the same index entry
        new_index = pd.MultiIndex.from_arrays(
            [[bid], [cpt], [str(id_merged)], [True]])

        text_merged = df.loc[id_merged, 'text'].sum()
        text_chunked = chunk_text(text_merged, num_tok)

        new_rows = []
        for tc in text_chunked:
            t = ' '.join(tc)
            t_enc = tokenizer.tokenize(t)   # tokenize once, reuse for length
            # NOTE: values are positional and rely on df.columns being
            # (text, bullets, text_enc, text_num_tokens), in this order
            new_rows.append(pd.DataFrame(
                data = [[t,
                        df.iloc[i-1].bullets,
                        t_enc,
                        len(t_enc)]],
                index = new_index,
                columns = df.columns))
        # one concat for the whole window instead of one append per chunk
        df_merge = pd.concat([df_merge] + new_rows, sort=False)
        return False, df_merge

    # if the text is too short, we merge it
    elif num_tok < CHUNK_MIN_LEN:
        print('skipping', id_merged)
        return True, df_merge

    # only reachable when num_tok is not comparable (e.g. NaN)
    raise ValueError('cannot classify window with num_tok=%r' % (num_tok,))

#### Create the new dataset

In [None]:
# define new dataset for the chapters:
# an empty frame with the same columns as df and a 4-level index
# (book, chapter, merge, chunk) that chunk_or_merge fills row by row
indices = pd.MultiIndex.from_arrays(
    [[], [], [], []],
    names = ['book', 'chapter', 'merge', 'chunk'])
df_moc = pd.DataFrame(index = indices, columns = df.columns.tolist())

# minimum length of a chunk
# (read as a global by chunk_or_merge, so it must be defined before the loop)
CHUNK_MIN_LEN = np.ceil(config.BULLETS_MED_LEN / config.MAX_RATIO)
print('Chunk minimum length:', CHUNK_MIN_LEN)
print()

# index labels of df in row order; chunk_or_merge slices this list
index = df.index.tolist()

# positional offset (for df.iloc / index) of the first row of the current chapter.
# NOTE(review): this assumes df rows are laid out book by book in sorted book
# order and, inside a book, chapter by chapter in the order returned by
# config.get_cpts — TODO confirm.
i_cpt = 0
for bid in sorted(list(set(df.index.get_level_values(0)))):
    cpts = config.get_cpts(df, bid)
    for cpt in cpts:
        print('book', bid, 'chapter', cpt)
        print()

        # per-chapter state:
        #   merge        -> the current window is still too short, keep extending it
        #   num_row/tok  -> rows / tokens in the current window
        #   *_past       -> rows / tokens of the previous window of this chapter
        merge = False
        num_row, num_row_past = 0, 0
        num_tok, num_tok_past = 0, 0
        max_i = i_cpt + len(df.loc[bid, cpt])

        # for each section in chapter
        for i in range(i_cpt, max_i):
            # if previous section is too short, 
            # we merge it with the one below
            if merge: 
                num_row += 1
                print('merging', num_tok, '+', df.iloc[i].text_num_tokens)
                num_tok += df.iloc[i].text_num_tokens
            else:
                # start a new window, remembering the size of the one just closed
                num_row_past = num_row
                num_row = 1
                num_tok_past = num_tok
                num_tok = df.iloc[i].text_num_tokens
            '''
                after merging or considering a new section, we call this function
                to study the length. Then we either add the row to the dataset, chunk it 
                or do nothing if it is still too short
            '''
            merge, df_moc =\
                chunk_or_merge(df_moc, num_row, num_tok, bid, cpt, i+1, index)

            print()
            
            '''
                if the last chunk of the chapter is too small, we merge it
                with the above and repeat one last time the process
            '''
            if merge and (i+1 == max_i):
                print('EXCEPTION, last chunk too small.')
                if num_row_past == 0:
                    # no earlier window in this chapter to merge with: call again
                    # with MODEL_MAX_LEN-1 in place of the real token count,
                    # presumably to force the window to be added as is — TODO confirm
                    _, df_moc =\
                        chunk_or_merge(df_moc, num_row, config.MODEL_MAX_LEN-1, bid, cpt, i+1, index)
                else:
                    num_row += num_row_past
                    num_tok += num_tok_past
                    # drop last row added and re-calculate for new, bigger chunk
                    # (drop works by label: if the previous window was chunked, all
                    # of its rows share this index entry and are removed together)
                    df_moc = df_moc.drop(df_moc.index[-1])
                    _, df_moc =\
                        chunk_or_merge(df_moc, num_row, num_tok, bid, cpt, i+1, index)
                 
        # advance the offset past the rows of this chapter
        i_cpt += len(df.loc[bid, cpt])
        print('\n############################')
        print()
        print()

# Sanity check: for every (book, chapter, merge, chunk) entry of df_moc, the
# texts of its rows joined with ' ' must equal the concatenated text of the
# original df sections it was built from.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for idx, r in df_moc.groupby(level=[0,1,2,3]).text.apply(list).items():
    # idx[2] is str(list of df index tuples); parse it back without eval()
    idx_merged = ast.literal_eval(idx[2])
    old_txt = ''.join(df.loc[idx_merged].text.tolist()).strip()
    new_txt = ' '.join(r).strip()
    assert old_txt == new_txt, 'text changed while merging/chunking %s' % (idx,)

Chunk minimum length: 720.0

book 9781905832729 chapter ch_2

num_tok MOC 11
skipping [(9781905832729, 'ch_2', 'ch_2', 'ch_2')]

merging 11 + 198
num_tok MOC 209
skipping [(9781905832729, 'ch_2', 'ch_2', 'ch_2'), (9781905832729, 'ch_2', '1.i', '1.i')]

merging 209 + 609
num_tok MOC 818
summ [(9781905832729, 'ch_2', 'ch_2', 'ch_2'), (9781905832729, 'ch_2', '1.i', '1.i'), (9781905832729, 'ch_2', '1.ii', '1.ii')]


############################


book 9781905832729 chapter ch_3

num_tok MOC 85
skipping [(9781905832729, 'ch_3', '2.i', '2.i')]

merging 85 + 231
num_tok MOC 316
skipping [(9781905832729, 'ch_3', '2.i', '2.i'), (9781905832729, 'ch_3', '2.ii', '2.ii')]

merging 316 + 417
num_tok MOC 733
summ [(9781905832729, 'ch_3', '2.i', '2.i'), (9781905832729, 'ch_3', '2.ii', '2.ii'), (9781905832729, 'ch_3', '2.iii', '2.iii')]

num_tok MOC 383
skipping [(9781905832729, 'ch_3', '2.iv', '2.iv')]

merging 383 + 758
num_tok MOC 1141
chunking [(9781905832729, 'ch_3', '2.iv', '2.iv'), (978190583272



[1;30;43mStreaming output truncated to the last 5000 lines.[0m


############################


book 9781910797495 chapter chp4

num_tok MOC 53
skipping [(9781910797495, 'chp4', 'chp4', 'chp4')]

merging 53 + 329
num_tok MOC 382
skipping [(9781910797495, 'chp4', 'chp4', 'chp4'), (9781910797495, 'chp4', 'sect27', 'sect27')]

merging 382 + 500
num_tok MOC 882
summ [(9781910797495, 'chp4', 'chp4', 'chp4'), (9781910797495, 'chp4', 'sect27', 'sect27'), (9781910797495, 'chp4', 'sect28', 'sect28')]

num_tok MOC 300
skipping [(9781910797495, 'chp4', 'sect29', 'sect29')]

merging 300 + 427
num_tok MOC 727
summ [(9781910797495, 'chp4', 'sect29', 'sect29'), (9781910797495, 'chp4', 'sect30', 'sect30')]


############################


book 9781910797495 chapter chp5

num_tok MOC 375
skipping [(9781910797495, 'chp5', 'chp5', 'chp5')]

merging 375 + 1142
num_tok MOC 1517
chunking [(9781910797495, 'chp5', 'chp5', 'chp5'), (9781910797495, 'chp5', 'sect33', 'sect33')]

num_tok MOC 160
skipping [(9781

In [None]:
# Collapse df_moc to one row per (book, chapter). The `merge` and `chunk`
# index levels become columns first; `merge`, `chunk` and `text` are then
# gathered into lists, while `bullets` keeps the first value of the chapter.
# Columns not listed in the aggregation are not carried over.
moc_flat = df_moc.reset_index(level=[2, 3])
moc_by_chapter = moc_flat.groupby(['book', 'chapter'], sort=False)
df_moc = moc_by_chapter.agg({
    'merge': lambda col: list(col),
    'chunk': lambda col: list(col),
    'text': lambda col: list(col),
    'bullets': lambda col: list(col)[0],
})

#### Save new dataset

In [None]:
# Save the per-chapter dataset under OUTPUT_PATH (which ends with the MODEL name).
df_moc.to_csv(OUTPUT_PATH+'df.csv')

#### Create train, test, validation

Generate files:
* train.source
* train.target
* val.source
* val.target
* test.source
* test.target

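Chapters are shuffled and then split by cumulative chunk count into roughly 80% train, 10% validation and 10% test. Note that the code below produces separate validation and test sets.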

In [9]:
# Flatten each chapter's bullets into a single space-separated string.
df_moc['bullets'] = df_moc['bullets'].map(' '.join)

In [10]:
# Shuffle the chapters reproducibly, then accumulate the number of chunks
# per chapter so the split points can be chosen by chunk count.
df_moc = df_moc.sample(frac=1, random_state=config.SEED)
df_moc['num_chunks'] = df_moc['text'].map(len).cumsum()
tot_chunk = df_moc['num_chunks'].iloc[-1]

# split1 / split2: one past the first chapter whose cumulative chunk count
# exceeds 80% / 90% of the total.
chunks_so_far = df_moc['num_chunks']
split1 = np.flatnonzero(chunks_so_far > int(tot_chunk*0.8))[0] + 1
split2 = np.flatnonzero(chunks_so_far > int(tot_chunk*0.9))[0] + 1
print(split1, split2)

365 409


In [11]:
# Slice the shuffled chapters at the two split points; explode('text')
# turns every chunk of a chapter into its own row.
train = df_moc.iloc[:split1].explode('text')
val = df_moc.iloc[split1:split2].explode('text')
test = df_moc.iloc[split2:].explode('text')

# Save each split as csv under OUTPUT_PATH.
for csv_name, split_df in (('train.csv', train),
                           ('val.csv', val),
                           ('test.csv', test)):
    split_df.to_csv(OUTPUT_PATH + csv_name)

In [13]:
# Write the train split as three line-aligned files: line k of .source is
# the text, of .target the bullets and of .index the index label of row k.
# The encoding is explicit because open() otherwise uses a platform-dependent
# default, which can fail or corrupt non-ASCII text.
with open(OUTPUT_PATH+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(OUTPUT_PATH+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(OUTPUT_PATH+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')

In [14]:
# Write the validation split as three line-aligned files: line k of .source
# is the text, of .target the bullets and of .index the index label of row k.
# The encoding is explicit because open() otherwise uses a platform-dependent
# default, which can fail or corrupt non-ASCII text.
with open(OUTPUT_PATH+'val.source', 'w', encoding='utf-8') as va_s,\
    open(OUTPUT_PATH+'val.target', 'w', encoding='utf-8') as va_t,\
    open(OUTPUT_PATH+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')

In [15]:
# Write the test split as three line-aligned files: line k of .source is
# the text, of .target the bullets and of .index the index label of row k.
# The encoding is explicit because open() otherwise uses a platform-dependent
# default, which can fail or corrupt non-ASCII text.
with open(OUTPUT_PATH+'test.source', 'w', encoding='utf-8') as te_s,\
    open(OUTPUT_PATH+'test.target', 'w', encoding='utf-8') as te_t,\
    open(OUTPUT_PATH+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

### **Study new dataset**

In [None]:
# Mean number of tokens per row of df_moc.
# NOTE(review): the per-chapter aggregation cell keeps only merge / chunk /
# text / bullets, so `text_num_tokens` is only available on df_moc as it is
# right after the merge-or-chunk loop; run this cell before aggregating.
np.mean(df_moc.text_num_tokens.tolist())

743.2905027932961

In [None]:
# Count how many original sections ended up in chunked vs. non-chunked entries.
# NOTE(review): this reads the 4-level (book, chapter, merge, chunk) index,
# i.e. df_moc as it is right after the merge-or-chunk loop; after the
# per-chapter aggregation the index only has (book, chapter) and idx[2]
# would fail. Run this cell before aggregating.
num_chunked, num_not_chunked = 0, 0
# set(): chunked entries repeat the same index label once per chunk
for idx in set(df_moc.index.tolist()):
    # idx[2] is str(list of df index tuples); parse it back without eval()
    merged_ids = ast.literal_eval(idx[2])
    chunk = idx[3]
    if chunk:
        num_chunked += len(merged_ids)
    else:
        num_not_chunked += len(merged_ids)

# every section of df must be accounted for exactly once
assert num_chunked + num_not_chunked == len(df)

print('percentage of chunked sections: %.2f %%'%(num_chunked/len(df)*100))

percentage of chunked sections: 75.24 %
