### **Config**

In [1]:
import os
import sys

sys.path.insert(0, '/home/marco/epfl/magma/')
import config

In [2]:
# Model family the dataset is prepared for; it selects the tokenizer below
# and names the output directory.
MODEL = 'pegasus'

RE_SPLITTER = '\n'              # do we split into sentences or paragraphs?
                                # use '\.(?!\d)|\n' or '\n', respectively

# Output path
OUTPUT_PATH = config.MAGMA_DIR+'datasets/bullet_paragraph_rouge/'+MODEL+'/'
# exist_ok=True replaces the separate existence check: no check-then-create
# race, and the cell can be re-run safely.
os.makedirs(OUTPUT_PATH, exist_ok=True)

### **Init**

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import gensim
from tqdm import tqdm
tqdm.pandas()

# Load the pretrained tokenizer that matches the configured MODEL family.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer=\
        T5Tokenizer.from_pretrained('t5-large')
else:
    # Fail fast: without this branch `tokenizer` would stay undefined and the
    # notebook would only break later with a NameError.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing "
        "'pegasus', 'bart' or 't5'." % MODEL)

### **Karger Books Base Dataset**

In [4]:
# Load the base Karger books dataset and index it by book / chapter /
# section / subsection.
base_dataset = config.MAGMA_DIR+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
# The bullets column is read from the CSV as text; eval turns every non-null
# cell back into a Python object (presumably a list of bullet strings, since
# the column is exploded later -- TODO confirm).
# NOTE(review): eval executes whatever expression the CSV contains. If the
# cells are plain literals, ast.literal_eval would be a safer replacement --
# confirm before switching.
df.bullets = df.bullets.map(eval, na_action='ignore')

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> config.TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens)

In [5]:
# Split in sentences / paragraphs based on RE_SPLITTER
# (empty pieces are discarded, missing texts are left untouched)
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
# dropna() removes every row that has a missing value in any column
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces
# (raw string: '\s' inside a plain string literal is an invalid escape
# sequence and triggers a warning on recent Python versions)
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Remove long words (> config.TOKEN_MAX_LEN chars)
def para2words(para):
    """Tokenize a paragraph with gensim's ``simple_preprocess``.

    Accents are stripped (``deacc=True``) and the maximum token length is
    set to ``config.TOKEN_MAX_LEN``.

    Args:
        para: the paragraph text to tokenize.

    Returns:
        Whatever ``gensim.utils.simple_preprocess`` returns for ``para``
        (the tokens that were kept).
    """
    preprocess_options = {'deacc': True, 'max_len': config.TOKEN_MAX_LEN}
    return gensim.utils.simple_preprocess(para, **preprocess_options)
# Tokenize every paragraph; the token list is only used for the length filter
df['para_proc'] = df.para.map(para2words)

# Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens):
# blank out their token list, then drop the rows that now contain a NaN
num_words = df.para_proc.map(len)
is_too_short = num_words < config.PARA_MIN_LEN
df.loc[is_too_short, 'para_proc'] = np.nan

df = df.dropna()

### **Assign Bullets to Best Para and Expand Functions**

In [6]:
def assign_best_metric_para(df, col_metric):
    """Flag, for every bullet, the paragraph that scores best on ``col_metric``.

    A boolean ``best_match`` column is (re)created on ``df`` itself: it is
    True on the rows whose (bullet, paragraph) pair has the highest
    ``col_metric`` value among all rows of that bullet (first row on ties).
    The function also prints the percentage of best-match rows whose
    ``compression_ratio`` is at or above ``config.MAX_RATIO``.

    Args:
        df: frame with ``bullets``, ``para``, ``compression_ratio`` and
            ``col_metric`` columns. It is modified in place.
        col_metric: name of the column used to rank the paragraphs.

    Returns:
        The same ``df`` object, with the ``best_match`` column set.
    """
    df['best_match'] = False

    # One row per bullet: the row where col_metric is highest.
    best_rows = df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[col_metric].argmax()])

    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0; both yield (index label, value) pairs.
    for bullet, para in best_rows.para.items():
        df.loc[\
            (df['bullets'] == bullet) &\
            (df['para'] == para), 'best_match'] = True

    # NOTE(review): this check uses >= while the other ratio checks in the
    # notebook use > -- confirm which comparison is intended.
    para_too_short =\
        df[(df['compression_ratio'] >= config.MAX_RATIO) & df['best_match']]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df[df['best_match']])*100))

    return df

In [7]:
def expand_up_down(df, col_metric):
    """Grow each bullet's best-match paragraph with neighbouring paragraphs.

    For every distinct bullet, the rows of that bullet are taken in frame
    order and the expansion starts from the first row flagged ``best_match``.
    While the compression ratio (bullet tokens / merged paragraph tokens) is
    above ``config.MAX_RATIO`` and the merged token count is below
    ``tokenizer.model_max_length``, one adjacent paragraph is added: the
    next one if its ``col_metric`` is strictly higher than the previous
    one's, otherwise the previous one. If only one side is left, that side
    is used. All merged paragraphs are flagged ``best_match`` in ``df``.

    NOTE(review): the token limit is tested before a paragraph is added, so
    the last merge can push the total past ``model_max_length``.

    Args:
        df: frame with ``bullets``, ``para``, ``best_match``,
            ``bullets_num_tokens``, ``para_num_tokens``,
            ``compression_ratio`` and ``col_metric`` columns. It is modified
            in place.
        col_metric: name of the column that decides the merge direction.

    Returns:
        The same ``df`` object, with ``best_match`` extended.

    Raises:
        IndexError: if a bullet has no row flagged ``best_match``.
    """
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        is_bul = df['bullets'] == bul
        # positional 0..n-1 labels, so neighbours are simply idx-1 / idx+1
        df_bul = df[is_bul].reset_index()

        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        # the merged paragraphs always form the contiguous range [first, last]
        first = last = best_match_idx

        bul_num_tok = df_bul.loc[best_match_idx, 'bullets_num_tokens']
        merged_para_num_tok = df_bul.loc[best_match_idx, 'para_num_tokens']
        comp_ratio = df_bul.loc[best_match_idx, 'compression_ratio']
        max_idx = len(df_bul)-1

        while comp_ratio > config.MAX_RATIO and\
            merged_para_num_tok < tokenizer.model_max_length:

            can_merge_up = first > 0
            can_merge_down = last < max_idx

            # if we already merged all possible paragraphs
            if not can_merge_up and not can_merge_down:
                break

            # if we already merged the first paragraph
            elif not can_merge_up:
                merge_down = True

            # if we already merged the last paragraph
            elif not can_merge_down:
                merge_down = False

            # otherwise check for best metric inclusion (ties merge up)
            else:
                merge_down = df_bul.loc[first-1, col_metric] <\
                    df_bul.loc[last+1, col_metric]

            if merge_down:
                last += 1
                new_para_idx = last
            else:
                first -= 1
                new_para_idx = first

            merged_para_num_tok += df_bul.loc[new_para_idx, 'para_num_tokens']
            comp_ratio = bul_num_tok / merged_para_num_tok

        # .loc slicing is inclusive on both ends
        merged_paras = df_bul.loc[first:last, 'para']
        df.loc[is_bul & df['para'].isin(merged_paras), 'best_match'] = True

    return df

In [8]:
def print_stats(df, num_para_tot=18822):
    """Print how many paragraphs and tokens are covered by the best matches.

    A paragraph counts as kept when at least one of its rows has
    ``best_match`` set. Token totals use one ``para_num_tokens`` value per
    distinct paragraph (the first one encountered).

    Args:
        df: frame with ``para``, ``best_match`` and ``para_num_tokens``
            columns.
        num_para_tot: total number of paragraphs used as the denominator of
            the paragraph percentage. Defaults to 18822, the value that was
            previously hard-coded.
    """
    # One row per distinct paragraph: is it kept, and how many tokens it has.
    per_para = df.groupby('para', sort=False).agg({
        'best_match': 'any',
        'para_num_tokens': 'first'})
    is_kept = per_para['best_match']

    num_para_kept = int(is_kept.sum())
    print('%d out of %d paragraphs are considered using this method.'%(num_para_kept, num_para_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_para_kept/num_para_tot))

    print()
    num_tok_kept = per_para.loc[is_kept, 'para_num_tokens'].sum()
    num_tok_tot = per_para['para_num_tokens'].sum()

    print('%d out of %d tokens are considered using this method.'%(num_tok_kept, num_tok_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_tok_kept/num_tok_tot))

def print_stats_after_merge(df):
    """Print the sanity checks for the merged samples.

    Reports the percentage of rows whose ``compression_ratio`` is above
    ``config.MAX_RATIO``, then the number of rows whose ``para_num_tokens``
    exceeds ``tokenizer.model_max_length``, followed by those rows.

    Args:
        df: frame with ``compression_ratio`` and ``para_num_tokens`` columns.
    """
    above_ratio = df['compression_ratio'] > config.MAX_RATIO
    pct_too_short = len(df[above_ratio])/len(df)*100
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %pct_too_short)

    print()
    para_too_long = df[df['para_num_tokens'] > tokenizer.model_max_length]
    print('Paragraphs which are too long to fit into the model: %d paragraphs.'%\
          len(para_too_long))
    print(para_too_long)

### **Prepare Paragraphs**

In [9]:
# Close every paragraph with a full stop
df.para = df.para.map(lambda text: text+'.')
# The token-list helper column is no longer needed; afterwards give every
# bullet its own row, so each row is one (paragraph, bullet) pair
df = df.drop(columns='para_proc').explode('bullets')

In [10]:
def _num_tokens(text):
    """Number of tokens the model tokenizer produces for ``text``."""
    return len(tokenizer.tokenize(text))

df['para_num_tokens'] = df.para.map(_num_tokens)
df['bullets_num_tokens'] = df.bullets.map(_num_tokens)

# bullet length relative to paragraph length, both measured in tokens
df['compression_ratio'] = df['bullets_num_tokens'] / df['para_num_tokens']

### **Evaluate ROUGE recall**

In [11]:
from datasets import load_metric
metric = load_metric("rouge")

# Score every (paragraph, bullet) row: the paragraph is passed as the
# prediction and the bullet as the reference.
# Columns are read by label: positional access such as row[0] on a
# label-indexed Series is deprecated in recent pandas.
# NOTE(review): `use_agregator` is kept exactly as spelled in the original
# call; confirm the keyword name against the installed `datasets` version.
rouge_res =\
    df[['para', 'bullets']]\
    .progress_apply(lambda row:
    metric.compute(
        predictions = [row['para']],
        references = [row['bullets']],
        rouge_types = config.ROUGE_TYPES,
        use_agregator = False), axis=1)

for r in config.ROUGE_TYPES:
    # each result holds one score per ROUGE type for the single pair;
    # element 1 of that score is stored as the recall
    df[r+'_recall'] =\
        rouge_res.map(lambda score: score[r][0][1])

100%|██████████| 114574/114574 [06:33<00:00, 291.18it/s]


### **Assign Bullets to ONE Paragraph and Expand**

In [12]:
# Flag, for every bullet, its best paragraph on the configured ROUGE recall column.
# The function modifies and returns `df` itself, so `df_one_para` and `df`
# refer to the same frame from here on.
df_one_para = assign_best_metric_para(df, config.ROUGE_TYPE_RECALL)

100%|██████████| 2556/2556 [00:01<00:00, 1457.41it/s]


Percentage of paragraphs which are too short to be summarized: 46.01 %


#### Expand: Merge Up or Down

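Take one chapter into consideration, one bullet at a time. For each bullet, one paragraph is already assigned. If that paragraph is too short compared to the bullet (compression ratio above `config.MAX_RATIO`), merge it with the previous or the next paragraph, whichever has the higher ROUGE recall, and repeat until the ratio is acceptable or no more paragraphs can be added.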

In [13]:
# Extend the best matches that are too short with their neighbouring paragraphs.
# NOTE(review): the metric column is hard-coded here, while the best-match
# step reads config.ROUGE_TYPE_RECALL -- confirm both are meant to use the
# same metric.
df_one_para = expand_up_down(df_one_para, 'rougeL_recall')

100%|██████████| 2556/2556 [01:35<00:00, 26.64it/s]


In [14]:
# Report how many paragraphs and tokens are covered by the flagged rows
print_stats(df_one_para)

3720 out of 18822 paragraphs are considered using this method. Thus, 19.76 %

347364 out of 1229874 tokens are considered using this method. Thus, 28.24 %


In [15]:
# Keep only the flagged paragraphs and collapse them into one row per
# (book, chapter, bullet): texts are joined with a space, paragraph token
# counts are added up, and the bullet token count is taken from the first row.
selected_rows = df_one_para[df_one_para['best_match']].reset_index()
df_one_para = selected_rows.groupby(['book', 'chapter', 'bullets'], sort=False).agg({
    'para': lambda paras: ' '.join(list(paras)),
    # the 'sum' alias gives the same result as the builtin `sum` without the
    # FutureWarning that newer pandas raises for builtin callables in agg
    'para_num_tokens': 'sum',
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_one_para = df_one_para.rename(columns={'para': 'text'})

# recompute the ratio for the merged texts
df_one_para['compression_ratio'] = df_one_para.bullets_num_tokens / df_one_para.para_num_tokens

In [16]:
# Display the merged samples whose compression ratio is still above config.MAX_RATIO
df_one_para[df_one_para['compression_ratio'] > config.MAX_RATIO]

Unnamed: 0_level_0,Unnamed: 1_level_0,bullets,text,para_num_tokens,bullets_num_tokens,compression_ratio
book,chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [17]:
# Share of merged samples whose compression ratio is still above config.MAX_RATIO
still_above_ratio = df_one_para['compression_ratio'] > config.MAX_RATIO
para_too_short = df_one_para[still_above_ratio]
pct_too_short = len(para_too_short)/len(df_one_para)*100
print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
    %pct_too_short)

Percentage of paragraphs which are too short to be summarized: 0.00 %


In [18]:
# Print the merged samples whose token count exceeds the tokenizer's model_max_length
print(df_one_para[df_one_para['para_num_tokens'] > tokenizer.model_max_length])

Empty DataFrame
Columns: [bullets, text, para_num_tokens, bullets_num_tokens, compression_ratio]
Index: []


#### Save dataset

In [19]:
# Write the full (unsplit) frame of merged samples to OUTPUT_PATH
df_one_para.to_csv(OUTPUT_PATH+'df.csv')

#### Create train, test, validation (CC)

In [20]:
# Collapse to one row per group of the first two index levels, holding that
# group's bullets and texts as two parallel lists
columns_as_lists = dict.fromkeys(['bullets', 'text'], list)
df_one_para = df_one_para.groupby(level=[0, 1], sort=False).agg(columns_as_lists)

In [21]:
# Shuffle the rows reproducibly, then cut where the running bullet count
# passes 80 % and 90 % of all bullets
df_one_para = df_one_para.sample(frac=1, random_state=config.SEED)
df_one_para['num_bulls'] = df_one_para['bullets'].map(len).cumsum()
tot_bulls = df_one_para['num_bulls'].iloc[-1]


def _split_position(fraction):
    """Position right after the first row whose running bullet count exceeds ``fraction`` of the total."""
    exceeding = np.where(df_one_para.num_bulls > int(tot_bulls*fraction))[0]
    return exceeding[0]+1


split1 = _split_position(0.8)
split2 = _split_position(0.9)
print(split1, split2)

361 408


In [22]:
def _explode_rows(rows):
    """Turn rows holding parallel bullet / text lists into one row per pair."""
    exploded = rows.explode('bullets')
    # Exploding 'text' on the same rows gives the same index in the same
    # order (the two lists of a row are built from the same source rows),
    # so the texts line up with the bullets row by row.
    exploded['text'] = rows.explode('text')['text']
    return exploded


train = _explode_rows(df_one_para.iloc[:split1])
val = _explode_rows(df_one_para.iloc[split1:split2])
test = _explode_rows(df_one_para.iloc[split2:])

for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_df.to_csv(OUTPUT_PATH+split_name+'.csv')

In [23]:
# Write the training split as three line-aligned files: line i of
# train.source / train.target / train.index holds the text, the bullet and
# the index key of sample i.
# The encoding is set explicitly so the files do not depend on the locale.
with open(OUTPUT_PATH+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(OUTPUT_PATH+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(OUTPUT_PATH+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, text, bullet in zip(train.index, train['text'], train['bullets']):
        tr_i.write(str(idx) + '\n')
        tr_s.write(text + '\n')
        tr_t.write(bullet + '\n')

In [24]:
# Write the validation split as three line-aligned files: line i of
# val.source / val.target / val.index holds the text, the bullet and the
# index key of sample i.
# The encoding is set explicitly so the files do not depend on the locale.
with open(OUTPUT_PATH+'val.source', 'w', encoding='utf-8') as va_s,\
    open(OUTPUT_PATH+'val.target', 'w', encoding='utf-8') as va_t,\
    open(OUTPUT_PATH+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, text, bullet in zip(val.index, val['text'], val['bullets']):
        va_i.write(str(idx) + '\n')
        va_s.write(text + '\n')
        va_t.write(bullet + '\n')

In [25]:
# Write the test split as three line-aligned files: line i of
# test.source / test.target / test.index holds the text, the bullet and the
# index key of sample i.
# The encoding is set explicitly so the files do not depend on the locale.
with open(OUTPUT_PATH+'test.source', 'w', encoding='utf-8') as te_s,\
    open(OUTPUT_PATH+'test.target', 'w', encoding='utf-8') as te_t,\
    open(OUTPUT_PATH+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, text, bullet in zip(test.index, test['text'], test['bullets']):
        te_i.write(str(idx) + '\n')
        te_s.write(text + '\n')
        te_t.write(bullet + '\n')