In [7]:
import os

# Root of the magma checkout. Defaults to the original location; set the
# MAGMA_DIR environment variable to run the notebook from another checkout.
# os.path.join(..., '') guarantees the trailing separator that the plain
# string concatenations using magma_dir rely on.
magma_dir = os.path.join(
    os.environ.get('MAGMA_DIR', '/home/marco/epfl/magma/'), '')

### **Config**

In [8]:
import os
import sys

sys.path.insert(0, magma_dir)
import config

In [9]:
MODEL = 't5'                    # selects the tokenizer and the output folder

RE_SPLITTER = '\n'              # do we split sentences or paragraphs?
                                # use '\.(?!\d)|\n' or '\n', respectively

TOKEN_MAX_LEN = 99              # max length of a word
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

RECALL_THRESHOLD = 0.7          # recall above which a paragraph is also
                                # kept in the threshold study

# Output path
OUTPUT_PATH = magma_dir+'datasets/karger_books_para_rouge/'+MODEL+'/'
# exist_ok avoids the check-then-create race of a separate exists() test
os.makedirs(OUTPUT_PATH, exist_ok=True)

### **Init**

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import gensim
from tqdm import tqdm
tqdm.pandas()

# Load the tokenizer matching MODEL. The transformers class is imported
# inside each branch so only the selected one is imported.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer =\
        T5Tokenizer.from_pretrained('t5-large')
else:
    # Without this branch `tokenizer` would stay undefined and the notebook
    # would only fail later, at the first tokenizer call.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing "
        "'pegasus', 'bart' or 't5'" % MODEL)

### **Karger Books Base Dataset**

In [11]:
# Read the base dataset and index it by its position in the book
base_dataset = magma_dir+'datasets/karger_books_base/df.csv'
df = (
    pd.read_csv(base_dataset)
    .set_index(['book', 'chapter', 'section', 'subsection'])
)

# The bullets column is stored as text and turned back into Python objects.
# NOTE(review): eval executes whatever expression the CSV contains; only run
# this on a trusted file (ast.literal_eval would be the safe alternative if
# the cells are plain literals — confirm against the data).
df['bullets'] = df['bullets'].map(eval, na_action='ignore')

## **Paragraph Assign Bullets**

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [12]:
# Split in sentences / paragraphs based on RE_SPLITTER.
# Empty pieces are skipped before stripping, so a whitespace-only piece
# becomes '' here and is removed later by the minimum-length filter.
df.text = df.text.map(
    lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p != ''],
    na_action='ignore')

# Explode to get one row for each paragraph / sentence, then drop every
# row that has a missing value in any column
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces (raw string: '\s' is not a valid str escape and
# triggers a warning when written in a normal string literal)
df.para = df.para.map(lambda p: re.sub(r'\s+', ' ', p).strip())

# Tokenize each paragraph, ignoring words longer than TOKEN_MAX_LEN chars.
# para_proc is only used for the length filter below; the text in `para`
# itself is left untouched.
def para2words(para):
    """Return the gensim simple_preprocess tokens of ``para``.

    Called with deacc=True and max_len=TOKEN_MAX_LEN.
    """
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=TOKEN_MAX_LEN)
df['para_proc'] = df.para.map(para2words)

# Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens): mark them
# as missing, then drop the marked rows
df.loc[df.para_proc.map(len) < PARA_MIN_LENGTH, 'para_proc'] = np.nan

df = df.dropna()

### **Prepare Paragraphs**

In [13]:
# Close every paragraph with a full stop (trailing punctuation was stripped
# during preprocessing)
df['para'] = df['para'] + '.'

# The token lists are no longer needed; then pair every paragraph with each
# bullet of its row (one row per paragraph/bullet pair)
df = df.drop(columns=['para_proc']).explode('bullets')

In [14]:
def _num_tokens(text):
    """Number of tokens the selected tokenizer produces for ``text``."""
    return len(tokenizer.tokenize(text))

df['para_num_tokens'] = df['para'].map(_num_tokens)
df['bullets_num_tokens'] = df['bullets'].map(_num_tokens)

# Ratio of bullet length to paragraph length, in tokens
df['compression_ratio'] = df['bullets_num_tokens'] / df['para_num_tokens']

### **Evaluate ROUGE recall**

In [16]:
from datasets import load_metric
metric = load_metric("rouge")

# Score every (paragraph, bullet) pair: the paragraph is passed as the
# prediction and the bullet as the reference.
# Columns are read by label; positional row[0] / row[1] access on a
# label-indexed Series is deprecated in recent pandas.
# NOTE(review): the keyword is spelled `use_agregator`; presumably this
# matches the installed `datasets` release — confirm before upgrading.
rouge_res =\
    df[['para', 'bullets']]\
    .progress_apply(lambda row:
    metric.compute(
        predictions = [row['para']],
        references = [row['bullets']],
        rouge_types = config.ROUGE_TYPES,
        use_agregator = False), axis=1)

for r in config.ROUGE_TYPES:
    # score[r] holds one entry per pair (a single one here, hence [0]);
    # index 1 is taken as the recall field — confirm against the metric's
    # score tuple order
    df[r+'_recall'] =\
        rouge_res.map(lambda score: score[r][0][1])

100%|██████████| 114574/114574 [07:36<00:00, 251.03it/s]


### **Assign Bullets to ONE Paragraph and Expand**

In [17]:
def assign_highest_recall_para(df):
    """Flag, for every bullet, the paragraph with the highest recall.

    Args:
        df: DataFrame with at least the columns ``bullets``, ``para`` and
            the recall column named by ``config.ROUGE_TYPE_RECALL``.

    Returns:
        A copy of ``df`` with an extra boolean column ``best_match``. It is
        True on the rows whose (bullets, para) values equal those of the
        highest-recall row of that bullet, so rows repeating the same
        paragraph text for the same bullet are all flagged.
    """
    df_one_para = df.copy()
    df_one_para['best_match'] = False

    recall_col = config.ROUGE_TYPE_RECALL

    # First collect the best paragraph of every bullet, then flag the rows.
    # Iterating the groups directly avoids Series.iteritems(), which was
    # removed in pandas 2.0.
    best_paras = [
        (bullet, group['para'].iloc[group[recall_col].argmax()])
        for bullet, group in df_one_para.groupby('bullets')]

    for bullet, best_para in best_paras:
        df_one_para.loc[
            (df_one_para['bullets'] == bullet) &
            (df_one_para['para'] == best_para), 'best_match'] = True

    return df_one_para

# Flag the best-matching paragraph of every bullet (df itself is not modified)
df_one_para = assign_highest_recall_para(df)

In [18]:
# Among the best matches, find those whose bullet is long compared to the
# paragraph (compression ratio at or above config.MAX_RATIO)
best_match_rows = df_one_para[df_one_para['best_match']]
para_too_short = best_match_rows[
    best_match_rows['compression_ratio'] >= config.MAX_RATIO]

pct_too_short = len(para_too_short) / len(best_match_rows) * 100
print('Percentage of paragraphs which are too short to be summarized: %.2f %%'
      % pct_too_short)

Percentage of paragraphs which are too short to be summarized: 47.30 %


#### Expand: Merge Up or Down

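Consider one bullet at a time, together with the paragraphs it is paired with. Each bullet already has one paragraph assigned (its best match). If that paragraph is too short compared to the bullet (compression ratio above `config.MAX_RATIO`), merge it with the previous or the next paragraph, whichever has the higher ROUGE recall, and repeat until the ratio is acceptable or the merged text exceeds the tokenizer's maximum length.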

In [19]:
def expand_up_down(df, col_metric, verbose=False):
    """Grow each bullet's best-match paragraph with neighbouring paragraphs.

    For every distinct bullet, the rows of ``df`` carrying that bullet are
    taken in their existing order. Starting from the first row flagged
    ``best_match``, the previous or the next row is added repeatedly until
    the ratio ``bullets_num_tokens / merged para_num_tokens`` is no longer
    above ``config.MAX_RATIO``, the merged length exceeds
    ``tokenizer.model_max_length``, or no neighbouring row is left.

    Args:
        df: DataFrame with the columns ``bullets``, ``para``, ``best_match``,
            ``para_num_tokens``, ``bullets_num_tokens``,
            ``compression_ratio`` and ``col_metric``. Index levels 0 and 1
            are only read for the verbose output (printed as book and
            chapter). Every bullet is assumed to have at least one row
            flagged ``best_match``.
        col_metric: name of the column compared between the previous and
            the next row when both exist; the next row is chosen only if
            its value is strictly higher.
        verbose: print a trace of the merging decisions.

    Returns:
        The same ``df`` object, modified in place: ``best_match`` is set to
        True on every row whose (para, bullets) values were merged.
    """
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        if verbose : print(bul)
        df_bul = df[df['bullets'] == bul]

        # get book and chapter where this bullet is
        book = df_bul.index.get_level_values(0)[0]
        cpt = df_bul.index.get_level_values(1)[0]

        # positional 0..n-1 index, so neighbours are idx-1 / idx+1
        df_bul = df_bul.reset_index()
        last_idx = len(df_bul) - 1
        if verbose : print(df_bul[df_bul['best_match']].para.tolist())
        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        merged_para_idx = [best_match_idx]

        bul_num_tok = df_bul.loc[best_match_idx].bullets_num_tokens
        comp_ratio = df_bul.loc[best_match_idx].compression_ratio
        if verbose:
            print('Book %s, Chapter %s'%(book, cpt))
            print('Paragraphs in this chapter:', len(df_bul))
            print('Location of best_bul index:', best_match_idx)
            print('Compression ratio before merging: %.2f %%'%comp_ratio)
            print()
        while comp_ratio > config.MAX_RATIO:
            top = min(merged_para_idx)
            bottom = max(merged_para_idx)
            can_merge_up = top > 0
            can_merge_down = bottom < last_idx

            if not (can_merge_up or can_merge_down):
                # Every row of this bullet is already merged. Stop here
                # instead of indexing past the end of the frame.
                if verbose : print('no paragraph left to merge')
                break

            if not can_merge_up:
                merge_down = True
            elif not can_merge_down:
                merge_down = False
            else:
                if verbose : print('based on metric %s '%col_metric, end='')
                # ties go to the previous paragraph (merge up)
                merge_down = (df_bul.loc[top-1, col_metric] <
                              df_bul.loc[bottom+1, col_metric])

            if merge_down:
                if verbose : print('merge down')
                new_para_idx = bottom+1
            else:
                if verbose : print('merge up')
                new_para_idx = top-1
            df_bul.loc[new_para_idx, 'best_match'] = True
            merged_para_idx.append(new_para_idx)

            # length of everything currently flagged for this bullet
            merged_para_len = np.sum(df_bul[df_bul['best_match']].para_num_tokens.tolist())
            comp_ratio = bul_num_tok / merged_para_len
            # NOTE: the paragraph that crosses the model limit stays merged
            if merged_para_len > tokenizer.model_max_length:
                break
            if verbose:
                print(df_bul[df_bul['best_match']].para.tolist())
                print('Compression ratio: %.2f %%'%comp_ratio)
                print()

        # write the merged selection back to the full frame
        merged_rows = df_bul.loc[merged_para_idx]
        for p, b in zip(merged_rows['para'].tolist(),
                        merged_rows['bullets'].tolist()):
            df.loc[(df['para'] == p) &
                (df['bullets'] == b), 'best_match'] = True
        if verbose : print()

    return df

In [20]:
# Expand too-short best matches with neighbouring paragraphs; the
# 'rougeL_recall' column decides between the previous and the next one
df_one_para = expand_up_down(df_one_para, 'rougeL_recall')

100%|██████████| 2556/2556 [03:29<00:00, 12.18it/s]


In [21]:
# For every distinct paragraph text: is it kept for at least one bullet?
para_is_kept = df_one_para.groupby('para')['best_match'].apply(np.any)

# The total is derived from the same grouping instead of being hard-coded,
# so it stays correct when the preprocessing settings change.
num_para_tot = len(para_is_kept)
num_para_kept = np.sum(para_is_kept.tolist())
print('%d out of %d paragraphs are considered using this method.'%(num_para_kept, num_para_tot), end=' ')
print('Thus, %.2f %%'%(100*num_para_kept/num_para_tot))

3770 out of 18822 paragraphs are considered using this method. Thus, 20.03 %


In [22]:
# One row per distinct paragraph: is it kept for at least one bullet?
df_count_tokens = (
    df_one_para
    .groupby('para', sort=False)['best_match']
    .any()
    .reset_index()
)
df_count_tokens['para_len'] = df_count_tokens['para'].map(len)

# Characters in the kept paragraphs versus characters in all paragraphs
kept_paras = df_count_tokens[df_count_tokens['best_match']]
num_letters_kept = kept_paras['para_len'].sum()
num_letters_tot = df_count_tokens['para_len'].sum()

print('%d out of %d letters are considered using this method.'%(num_letters_kept, num_letters_tot), end=' ')
print('Thus, %.2f %%'%(100*num_letters_kept/num_letters_tot))

1831736 out of 6407951 letters are considered using this method. Thus, 28.59 %


In [23]:
# One row per distinct paragraph: kept flag and its token count (the count
# is the same on every row of a paragraph, so the first one is taken)
df_count_tokens = (
    df_one_para
    .groupby('para', sort=False)
    .agg({'best_match': 'any', 'para_num_tokens': 'first'})
)

kept_paras = df_count_tokens[df_count_tokens['best_match']]
num_tok_kept = kept_paras['para_num_tokens'].sum()
num_tok_tot = df_count_tokens['para_num_tokens'].sum()

print('%d out of %d tokens are considered using this method.'%(num_tok_kept, num_tok_tot), end=' ')
print('Thus, %.2f %%'%(100*num_tok_kept/num_tok_tot))

423168 out of 1493837 tokens are considered using this method. Thus, 28.33 %


In [24]:
# Keep only the flagged rows and collapse them to one row per
# (book, chapter, bullet): texts are joined with a space in row order,
# paragraph token counts are added up, and the bullet token count (equal on
# all rows of a bullet) is taken from the first row.
df_one_para = (
    df_one_para
    .loc[lambda frame: frame['best_match']]
    .reset_index()
    .groupby(['book', 'chapter', 'bullets'], sort=False)
    .agg({
        'para': ' '.join,
        'para_num_tokens': 'sum',
        'bullets_num_tokens': 'first',
    })
    .reset_index(level='bullets')
    .rename(columns={'para': 'text'})
)

# Recompute the ratio on the merged text
df_one_para['compression_ratio'] = (
    df_one_para['bullets_num_tokens'] / df_one_para['para_num_tokens'])

In [25]:
# Bullets whose merged text is still too short (ratio above config.MAX_RATIO)
df_one_para[df_one_para['compression_ratio'] > config.MAX_RATIO]

Unnamed: 0_level_0,Unnamed: 1_level_0,bullets,text,para_num_tokens,bullets_num_tokens,compression_ratio
book,chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9781908541727,ch09,"Refer urgently (the same day), patients with o...",Patients with seventh nerve palsy need to be s...,528,181,0.342803
9781908541727,ch10,"Refer within 1 week, any child with:- squint (...",Congenital nasolacrimal duct obstruction is ve...,516,149,0.28876
9781908541727,ch10,"Refer urgently, to be seen the same day, if yo...",Young children are unable to complain of poor ...,592,249,0.420608
9781908541727,ch11,"Refer urgently, to be seen by an ophthalmologi...",A patient suspected of having any of these inj...,530,280,0.528302
9781910797495,chp7,"Pharmacotherapy, based on only a small number ...",Central pain is defined by the International A...,521,142,0.272553
9781910797723,chp4,Immune checkpoint inhibitors have been approve...,Immune checkpoint molecules are cell surface r...,555,145,0.261261
9783318067095,ch9,Validation ensures that the technology is meas...,"In other instances, a digital assessment may m...",575,197,0.342609


In [26]:
# Share of bullets whose merged text is still above config.MAX_RATIO
still_too_short = df_one_para['compression_ratio'] > config.MAX_RATIO
para_too_short = df_one_para[still_too_short]

pct_too_short = len(para_too_short) / len(df_one_para) * 100
print('Percentage of paragraphs which are too short to be summarized: %.2f %%'
      % pct_too_short)

Percentage of paragraphs which are too short to be summarized: 0.27 %


In [27]:
# Rows whose merged text is longer than the tokenizer's model_max_length
print(df_one_para[df_one_para['para_num_tokens'] > tokenizer.model_max_length])

                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541086 ch_11    Eating disorders not otherwise specified is th...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch11     Refer urgently, to be seen by an ophthalmologi...   
9781910797181 ch07     Nortriptyline and cytisine are both effective ...   
9781910797211 ch02     At present, there are no guidelines for geneti...   
978191079729

#### Save dataset

In [28]:
# Persist the merged frame (one row per book / chapter / bullet) as CSV
df_one_para.to_csv(OUTPUT_PATH+'df.csv')

#### Create train, test, validation (CC)

In [29]:
# One row per (book, chapter), holding the chapter's bullets and texts as
# two parallel lists, so that whole chapters can be shuffled and split
df_one_para = (
    df_one_para
    .groupby(level=[0, 1], sort=False)
    .agg({'bullets': list, 'text': list})
)

In [30]:
# Shuffle the chapters reproducibly, then keep a running count of bullets
df_one_para = df_one_para.sample(frac=1, random_state=config.SEED)
df_one_para['num_bulls'] = df_one_para['bullets'].map(len).cumsum()
tot_bulls = df_one_para['num_bulls'].iloc[-1]

# Cut after the first chapter whose running count exceeds 80 % (resp. 90 %)
# of all bullets; the +1 puts that chapter in the earlier split
cum_bulls = df_one_para['num_bulls'].values
split1, split2 = (
    np.flatnonzero(cum_bulls > int(tot_bulls*frac))[0]+1
    for frac in (0.8, 0.9))
print(split1, split2)

361 408


In [31]:
def _explode_pairs(chapters):
    """Turn per-chapter lists back into one row per (bullet, text) pair.

    Both list columns are exploded separately; they have the same length on
    every row, so the two exploded results share the same index and order.
    """
    pairs = chapters.explode('bullets')
    pairs['text'] = chapters.explode('text')['text']
    return pairs

# Chapters before split1 go to train, the next ones up to split2 to
# validation, the remaining ones to test
train = _explode_pairs(df_one_para.iloc[:split1])
val = _explode_pairs(df_one_para.iloc[split1:split2])
test = _explode_pairs(df_one_para.iloc[split2:])

train.to_csv(OUTPUT_PATH+'train.csv')
val.to_csv(OUTPUT_PATH+'val.csv')
test.to_csv(OUTPUT_PATH+'test.csv')

In [32]:
# Write the training pairs as three line-aligned files: text (.source),
# bullet (.target) and the row index (.index).
# UTF-8 is set explicitly: the texts contain non-ASCII characters and the
# platform default encoding is not the same everywhere.
with open(OUTPUT_PATH+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(OUTPUT_PATH+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(OUTPUT_PATH+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')

In [33]:
# Write the validation pairs as three line-aligned files: text (.source),
# bullet (.target) and the row index (.index).
# UTF-8 is set explicitly: the texts contain non-ASCII characters and the
# platform default encoding is not the same everywhere.
with open(OUTPUT_PATH+'val.source', 'w', encoding='utf-8') as va_s,\
    open(OUTPUT_PATH+'val.target', 'w', encoding='utf-8') as va_t,\
    open(OUTPUT_PATH+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')

In [34]:
# Write the test pairs as three line-aligned files: text (.source),
# bullet (.target) and the row index (.index).
# UTF-8 is set explicitly: the texts contain non-ASCII characters and the
# platform default encoding is not the same everywhere.
with open(OUTPUT_PATH+'test.source', 'w', encoding='utf-8') as te_s,\
    open(OUTPUT_PATH+'test.target', 'w', encoding='utf-8') as te_t,\
    open(OUTPUT_PATH+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

### **Assign Bullets to Paragraphs (Threshold)**

In [27]:
# Start again from the best match of every bullet and count the flagged rows
df_thresh_para = assign_highest_recall_para(df)
num_best_para = int(df_thresh_para['best_match'].sum())

In [28]:
# Additionally flag every row whose recall is above RECALL_THRESHOLD
above_threshold = df_thresh_para[config.ROUGE_TYPE_RECALL] > RECALL_THRESHOLD
df_thresh_para.loc[above_threshold, 'best_match'] = True
num_thresh_para = int(df_thresh_para['best_match'].sum())

In [29]:
# Extra rows gained by the threshold, relative to the best-match-only count.
# NOTE(review): the message says "with respect to this method" while the
# denominator is num_best_para — confirm which base is intended.
extra_pct = 100*(num_thresh_para-num_best_para)/num_best_para

print('Considering ONLY the best paragraph for each bullet, we keep %d paragraphs.\n'%num_best_para)
print('Considering ALSO the paragraphs with recall > %.2f, we keep %d paragraphs.'%(RECALL_THRESHOLD, num_thresh_para))
print('In the first option, we lose only %.2f %% of the information with respect to this method.'%extra_pct)

Considering ONLY the best paragraph for each bullet, we keep 2556 paragraphs.

Considering ALSO the paragraphs with recall > 0.70, we keep 2586 paragraphs.
In the first option, we lose only 1.17 % of the information with respect to this method.


In [30]:
# Same study with a lower threshold of 0.5. The flags are added on top of
# those set so far in df_thresh_para.
above_lower_threshold = df_thresh_para[config.ROUGE_TYPE_RECALL] > 0.5
df_thresh_para.loc[above_lower_threshold, 'best_match'] = True
num_thresh_para_lower = int(df_thresh_para['best_match'].sum())

extra_pct_lower = 100*(num_thresh_para_lower-num_best_para)/num_best_para

print('Lowering the threshold to 0.50, we keep %d paragraphs.'%(num_thresh_para_lower))
print('In the first option, we lose %.2f %% of the information with respect to this method.\n'%extra_pct_lower)
print('This number is still quite low to decide to consider these paragraphs too.')

Lowering the threshold to 0.50, we keep 2943 paragraphs.
In the first option, we lose 15.14 % of the information with respect to this method.

This number is still quite low to decide to consider these paragraphs too.


### **Study Data**

In [31]:
# Reload the saved dataset and look at the longest merged text, in tokens
df = pd.read_csv(OUTPUT_PATH+'df.csv')
df = df.set_index(['book', 'chapter'])
df['para_num_tokens'].max()

907