#### For Colab

In [None]:
# Browser-side JavaScript kept inside a Python string literal, so the kernel
# only echoes it and never runs it. The snippet defines ClickConnect, which
# logs "Working" and clicks the "colab-toolbar-button" element, and schedules
# it every 900000 ms.
# NOTE(review): the final clearInterval(i) cancels that timer right after it is
# created if the snippet is pasted verbatim — presumably it is meant to be run
# separately to stop the clicking; confirm.
"""
function ClickConnect(){
    console.log("Working");
    document.querySelector("colab-toolbar-button").click() 
}
var i = setInterval(ClickConnect, 900000)
clearInterval(i)
"""

'\nfunction ClickConnect(){\n    console.log("Working");\n    document.querySelector("colab-toolbar-button").click() \n}\nvar i = setInterval(ClickConnect, 900000)\nclearInterval(i)\n'

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive; it is added to sys.path further down so
# that the project-local `config` module can be imported from it.
drive_dir = '/content/drive/My Drive/MAGMA: Summarization/'

#### Install Libraries

In [None]:
# Install the notebook's dependencies into the runtime.
# transformers is pinned to 4.1.1; sentencepiece only excludes release 0.1.92;
# datasets, rouge_score and gensim are not pinned to a version.
!pip install transformers==4.1.1
!pip install -U sentencepiece!=0.1.92
!pip install -U datasets
!pip install rouge_score
!pip install -U gensim

Collecting transformers==4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 6.2MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 33.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=8c69e659572b

### **Config**

In [None]:
import os
import sys

# Put the Drive project folder first on the module search path so that the
# project-local `config` module is found by the import below.
sys.path.insert(0, drive_dir)
import config

In [None]:
# General configurations

# Model family; it selects the tokenizer and the dataset sub-folder names.
MODEL = 'bart'

# Splitting regex: a '.' that is not followed by a digit, or a newline.
# The backslashes are escaped explicitly because '\.' and '\d' are invalid
# string escapes in a non-raw literal; the resulting string is unchanged.
RE_SPLITTER = '\\.(?!\\d)|\n'   # do we split sentences or paragraphs?
                                # use '\.|\n' or '\n', respectively

TOKEN_MAX_LEN = 99              # max length of a word
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

### **Init**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import gensim
from tqdm import tqdm
tqdm.pandas()

# Load the pretrained tokenizer that matches the configured model family.
# When MODEL contains neither 'pegasus' nor 'bart', no branch runs and
# `tokenizer` is left undefined.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




## **Assign Bullets**

### **Load Datasets**

In [None]:
# Load the chunk-per-chapter dataset of the selected model and index it by
# (book, chapter).
df_cc = pd.read_csv(config.DATASET_PATH+'karger_books_chunk_chapter/'+MODEL+'/df.csv')
df_cc = df_cc.set_index(['book', 'chapter'])
# The bullets and text cells are parsed from their string form with eval()
# (presumably Python list literals — verify against the writer of this CSV).
# NOTE(review): eval() executes whatever expression the file contains; consider
# ast.literal_eval if the CSV is not fully trusted.
df_cc.bullets = df_cc.bullets.map(eval)
df_cc.text = df_cc.text.map(eval)
# One row per element of `text`; the other columns are repeated on each row.
df_cc = df_cc.explode('text')

In [None]:
# Load the MoC dataset of the selected model and index it by
# (book, chapter, merge).
df_moc = pd.read_csv(config.DATASET_PATH+'karger_books_moc/'+MODEL+'/df.csv')
df_moc = df_moc.set_index(['book', 'chapter', 'merge'])
# The bullets and text cells are parsed from their string form with eval()
# (presumably Python list literals — verify against the writer of this CSV).
# NOTE(review): eval() executes whatever expression the file contains; consider
# ast.literal_eval if the CSV is not fully trusted.
df_moc.bullets = df_moc.bullets.map(eval)
df_moc.text = df_moc.text.map(eval)
# One row per element of `text`; the other columns are repeated on each row.
df_moc = df_moc.explode('text')

### **Evaluate ROUGE recall**

In [None]:
from datasets import load_metric
# ROUGE metric object; calculate_rouge below reads it as a global.
metric = load_metric("rouge")

def calculate_rouge(df, col_text, col_bullets):
    """Score every (text, bullet) pair of `df` with ROUGE.

    The `col_bullets` column is exploded so that each row pairs one text with
    one bullet; the text is passed as the prediction and the bullet as the
    reference. Relies on the notebook globals `metric` and `config`, and on
    `tqdm.pandas()` having been called (for `progress_apply`).

    Args:
        df: frame holding the two columns; it is not modified.
        col_text: name of the column with the text of each row.
        col_bullets: name of the column with the list of bullets of each row.

    Returns:
        The exploded frame with one extra `<rouge_type>_recall` column for
        every entry of `config.ROUGE_TYPES`.
    """
    # explode the bullets list and assign ROUGE for each bullet
    df_expl = df.explode(col_bullets)

    # The two cells are read by column label: integer keys on a label-indexed
    # row (row[0], row[1]) depend on a deprecated positional fallback.
    rouge_res = df_expl[[col_text, col_bullets]].progress_apply(
        lambda row: metric.compute(
            predictions=[row[col_text]],
            references=[row[col_bullets]],
            rouge_types=config.ROUGE_TYPES,
            # keyword spelling kept exactly as in the original call
            # NOTE(review): presumably the spelling the metric expects — verify
            use_agregator=False),
        axis=1)

    for r in config.ROUGE_TYPES:
        # Each pair yields a single score per ROUGE type; element 1 of that
        # score is what this notebook stores as the recall.
        df_expl[r+'_recall'] =\
            rouge_res.map(lambda score: score[r][0][1])

    return df_expl


# Replace both frames by their per-bullet exploded versions with the recall
# columns attached.
df_cc = calculate_rouge(df_cc, 'text', 'bullets')
df_moc = calculate_rouge(df_moc, 'text', 'bullets')

100%|██████████| 9984/9984 [01:59<00:00, 83.25it/s]
100%|██████████| 10947/10947 [01:59<00:00, 91.65it/s] 


### **Keep highest recall from bullets**

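Each bullet is assigned to the chunk with the highest ROUGE recall. Chunks that are not the best match for any bullet (around 30%) are dropped.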

In [None]:
def assign_highest_recall_text(df):
    """Flag, for every bullet, the row whose text has the highest recall.

    Relies on the notebook global `config` (`config.ROUGE_TYPE_RECALL` names
    the recall column) and on `tqdm.pandas()` having been called.

    Args:
        df: frame with 'bullets', 'text' and the recall column. It is modified
            in place: a boolean 'best_match' column is added or overwritten.

    Returns:
        The same `df` object, with 'best_match' set to True on every row whose
        (bullets, text) pair equals the best pair of that bullet.
    """
    df['best_match'] = False

    # For each distinct bullet, keep the row with the largest recall value.
    best_rows = df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[config.ROUGE_TYPE_RECALL].argmax()])

    # Series.items() is used instead of Series.iteritems(), which pandas 2.0
    # removed; both yield the same (bullet, text) pairs.
    for bullet, text in best_rows.text.items():
        is_best = (df['bullets'] == bullet) & (df['text'] == text)
        df.loc[is_best, 'best_match'] = True

    return df

# The function adds 'best_match' to the frame it receives and returns that same
# object, so at this point df_cc_from_bull is df_cc (and likewise for MoC).
df_cc_from_bull = assign_highest_recall_text(df_cc)
df_moc_from_bull = assign_highest_recall_text(df_moc)

100%|██████████| 2556/2556 [00:01<00:00, 1851.64it/s]
100%|██████████| 2556/2556 [00:01<00:00, 1630.11it/s]


In [None]:
# For every distinct chunk text: is any of its rows the best match of a bullet?
cc_chunk_matched = df_cc_from_bull.groupby('text')['best_match'].apply(np.any)
cc_matched_pct = 100*np.sum(cc_chunk_matched.tolist())/len(cc_chunk_matched)
print('Percentage of chunks with at least one bullet point in CC: %.2f %%'%\
    (cc_matched_pct))

Percentage of chunks with at least one bullet point in CC: 73.50 %


In [None]:
# For every distinct chunk text: is any of its rows the best match of a bullet?
moc_chunk_matched = df_moc_from_bull.groupby('text')['best_match'].apply(np.any)
moc_matched_pct = 100*np.sum(moc_chunk_matched.tolist())/len(moc_chunk_matched)
print('Percentage of chunks with at least one bullet point in MoC: %.2f %%'%\
    (moc_matched_pct))

Percentage of chunks with at least one bullet point in MoC: 68.86 %


In [None]:
# Keep only the rows flagged as the best match of some bullet.
df_cc_from_bull = df_cc_from_bull[df_cc_from_bull['best_match']]
df_moc_from_bull = df_moc_from_bull[df_moc_from_bull['best_match']]

In [None]:
# Sanity check: the number of kept rows equals the number of distinct bullets.
assert len(df_cc_from_bull) == len(df_cc.groupby('bullets').count())
assert len(df_moc_from_bull) == len(df_moc.groupby('bullets').count())

#### Save dataset

In [None]:
# Save the best-match rows of both datasets to
# <dataset>/assign_bullets/<MODEL>/df.csv.
cc_out_dir = config.DATASET_PATH+'karger_books_chunk_chapter/assign_bullets/'+MODEL+'/'
moc_out_dir = config.DATASET_PATH+'karger_books_moc/assign_bullets/'+MODEL+'/'

# The folders are created here because to_csv does not create missing
# directories, and this cell runs before the cells that create them.
for out_dir in (cc_out_dir, moc_out_dir):
    os.makedirs(out_dir, exist_ok=True)

df_cc_from_bull.to_csv(cc_out_dir+'df.csv')
# The MoC path previously lacked the '/' between MODEL and 'df.csv', so the
# file was written next to the model folder instead of inside it.
df_moc_from_bull.to_csv(moc_out_dir+'df.csv')

#### Create train, test, validation (CC)

In [None]:
# Output folder of the CC train/val/test files; created when it is missing.
op = ''.join((
    config.DATASET_PATH,
    'karger_books_chunk_chapter/assign_bullets/',
    MODEL,
    '/',
))
if not os.path.exists(op):
    os.makedirs(op)

In [None]:
# Collapse back to one row per (book, chapter), in first-seen order, gathering
# the matched bullets and texts of each chapter into two lists.
df_cc_from_bull = (
    df_cc_from_bull
    .groupby(level=[0, 1], sort=False)
    .agg({'bullets': list, 'text': list})
)

In [None]:
# Shuffle the rows reproducibly (seeded by config.SEED), then place the split
# points by cumulative bullet count rather than by row count.
df_cc_from_bull = df_cc_from_bull.sample(frac=1, random_state=config.SEED)
# Running total of bullets over the shuffled rows.
df_cc_from_bull['num_bulls'] = df_cc_from_bull.bullets.map(len).cumsum()
tot_bulls = df_cc_from_bull.num_bulls.iloc[-1]
# Position of the first row whose running total exceeds 80% (resp. 90%) of all
# bullets; the +1 makes it an exclusive slice end, so that row stays in the
# earlier split.
split1 = np.where(df_cc_from_bull.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_cc_from_bull.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [None]:
# Slice the shuffled rows into train / val / test and expand the bullets lists
# to one row per bullet.
train, val, test =\
    df_cc_from_bull.iloc[:split1].explode('bullets'),\
    df_cc_from_bull.iloc[split1:split2].explode('bullets'),\
    df_cc_from_bull.iloc[split2:].explode('bullets')

# Expand the text lists of the same slices and write them over the 'text'
# column of the bullet-exploded frames.
# NOTE(review): this relies on the bullets and text lists of a row having the
# same length, so both exploded frames carry the same index — confirm.
train['text'] = df_cc_from_bull.iloc[:split1].explode('text')['text']
val['text'] = df_cc_from_bull.iloc[split1:split2].explode('text')['text']
test['text'] = df_cc_from_bull.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [None]:
# Write the training split as three parallel files with one example per line:
# train.source (text), train.target (bullet) and train.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(op+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(op+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, text, bullet in zip(train.index, train['text'], train['bullets']):
        tr_i.write(str(idx) + '\n')
        tr_s.write(text + '\n')
        tr_t.write(bullet + '\n')

In [None]:
# Write the validation split as three parallel files with one example per
# line: val.source (text), val.target (bullet) and val.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'val.source', 'w', encoding='utf-8') as va_s,\
    open(op+'val.target', 'w', encoding='utf-8') as va_t,\
    open(op+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, text, bullet in zip(val.index, val['text'], val['bullets']):
        va_i.write(str(idx) + '\n')
        va_s.write(text + '\n')
        va_t.write(bullet + '\n')

In [None]:
# Write the test split as three parallel files with one example per line:
# test.source (text), test.target (bullet) and test.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'test.source', 'w', encoding='utf-8') as te_s,\
    open(op+'test.target', 'w', encoding='utf-8') as te_t,\
    open(op+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, text, bullet in zip(test.index, test['text'], test['bullets']):
        te_i.write(str(idx) + '\n')
        te_s.write(text + '\n')
        te_t.write(bullet + '\n')

#### Create train, test, validation (MoC)

In [None]:
# Output folder of the MoC train/val/test files; created when it is missing.
op = ''.join((
    config.DATASET_PATH,
    'karger_books_moc/assign_bullets/',
    MODEL,
    '/',
))
if not os.path.exists(op):
    os.makedirs(op)

In [None]:
# Collapse to one row per (book, chapter) — the first two index levels — in
# first-seen order, gathering the matched bullets and texts into two lists.
df_moc_from_bull = (
    df_moc_from_bull
    .groupby(level=[0, 1], sort=False)
    .agg({'bullets': list, 'text': list})
)

In [None]:
# Shuffle the rows reproducibly (seeded by config.SEED), then place the split
# points by cumulative bullet count rather than by row count.
df_moc_from_bull = df_moc_from_bull.sample(frac=1, random_state=config.SEED)
# Running total of bullets over the shuffled rows.
df_moc_from_bull['num_bulls'] = df_moc_from_bull.bullets.map(len).cumsum()
tot_bulls = df_moc_from_bull.num_bulls.iloc[-1]
# Position of the first row whose running total exceeds 80% (resp. 90%) of all
# bullets; the +1 makes it an exclusive slice end, so that row stays in the
# earlier split.
split1 = np.where(df_moc_from_bull.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_moc_from_bull.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [None]:
# Slice the shuffled rows into train / val / test and expand the bullets lists
# to one row per bullet.
train, val, test =\
    df_moc_from_bull.iloc[:split1].explode('bullets'),\
    df_moc_from_bull.iloc[split1:split2].explode('bullets'),\
    df_moc_from_bull.iloc[split2:].explode('bullets')

# Expand the text lists of the same slices and write them over the 'text'
# column of the bullet-exploded frames.
# NOTE(review): this relies on the bullets and text lists of a row having the
# same length, so both exploded frames carry the same index — confirm.
train['text'] = df_moc_from_bull.iloc[:split1].explode('text')['text']
val['text'] = df_moc_from_bull.iloc[split1:split2].explode('text')['text']
test['text'] = df_moc_from_bull.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [None]:
# Write the training split as three parallel files with one example per line:
# train.source (text), train.target (bullet) and train.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(op+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(op+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, text, bullet in zip(train.index, train['text'], train['bullets']):
        tr_i.write(str(idx) + '\n')
        tr_s.write(text + '\n')
        tr_t.write(bullet + '\n')

In [None]:
# Write the validation split as three parallel files with one example per
# line: val.source (text), val.target (bullet) and val.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'val.source', 'w', encoding='utf-8') as va_s,\
    open(op+'val.target', 'w', encoding='utf-8') as va_t,\
    open(op+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, text, bullet in zip(val.index, val['text'], val['bullets']):
        va_i.write(str(idx) + '\n')
        va_s.write(text + '\n')
        va_t.write(bullet + '\n')

In [None]:
# Write the test split as three parallel files with one example per line:
# test.source (text), test.target (bullet) and test.index (row index).
# The encoding is set explicitly so the files do not depend on the platform
# default.
# NOTE(review): assumes neither the text nor the bullet contains a newline,
# otherwise the three files fall out of line alignment — confirm upstream.
with open(op+'test.source', 'w', encoding='utf-8') as te_s,\
    open(op+'test.target', 'w', encoding='utf-8') as te_t,\
    open(op+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, text, bullet in zip(test.index, test['text'], test['bullets']):
        te_i.write(str(idx) + '\n')
        te_s.write(text + '\n')
        te_t.write(bullet + '\n')