### **Config**

In [2]:
import os
import sys

sys.path.insert(0, '/home/marco/epfl/magma/')
import config

In [3]:
# General configurations

# Model family the datasets were prepared for; the value selects the tokenizer
# and is part of every dataset path built in this notebook.
MODEL = 'pegasus'

# Regex used to split text: do we split on sentences or on paragraphs?
# Use r'\.|\n' or r'\n', respectively. The default below additionally refuses
# to split on a '.' that is immediately followed by a digit.
# Written as a raw string so '\.' and '\d' reach the regex engine as-is
# instead of relying on Python passing unknown string escapes through
# (which raises an invalid-escape warning on recent interpreters).
RE_SPLITTER = r'\.(?!\d)|\n'

### **Init**

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import gensim
from tqdm import tqdm
tqdm.pandas()

# Instantiate the tokenizer whose family name appears in MODEL; only the
# matching transformers class is imported.
# NOTE(review): there is no fallback branch, so `tokenizer` stays undefined
# when MODEL contains none of 'pegasus', 'bart' or 't5'.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer = T5Tokenizer.from_pretrained('t5-large')

## **Assign Bullets**

### **Load Datasets**

In [5]:
# Chunk-chapter (CC) dataset, indexed by (book, chapter).
df_cc = pd.read_csv(
    config.MAGMA_DIR + 'datasets/karger_books_chunk_chapter/' + MODEL + '/df.csv'
).set_index(['book', 'chapter'])

# `bullets` and `text` are presumably stored as stringified Python lists, so
# they are evaluated back into objects before use.
# NOTE(review): eval() runs whatever expression a CSV cell contains; this is
# only safe while the file is produced by this project. Consider
# ast.literal_eval if the CSV could ever come from elsewhere.
df_cc['bullets'] = df_cc['bullets'].map(eval)
df_cc['text'] = df_cc['text'].map(eval)

# One row per element of `text`.
df_cc = df_cc.explode('text')

In [6]:
# MoC dataset, indexed by (book, chapter, merge).
df_moc = pd.read_csv(
    config.MAGMA_DIR + 'datasets/karger_books_moc/' + MODEL + '/df.csv'
).set_index(['book', 'chapter', 'merge'])

# `bullets` and `text` are presumably stored as stringified Python lists, so
# they are evaluated back into objects before use.
# NOTE(review): eval() runs whatever expression a CSV cell contains; this is
# only safe while the file is produced by this project. Consider
# ast.literal_eval if the CSV could ever come from elsewhere.
df_moc['bullets'] = df_moc['bullets'].map(eval)
df_moc['text'] = df_moc['text'].map(eval)

# One row per element of `text`.
df_moc = df_moc.explode('text')

### **Evaluate ROUGE recall**

In [7]:
from datasets import load_metric
metric = load_metric("rouge")

def calculate_rouge(df, col_text, col_bullets):
    """Score every (text, bullet) pair of `df` with ROUGE and keep the recall.

    `df` is exploded on `col_bullets`, so each resulting row pairs one text
    with one bullet. The text is handed to the metric as the prediction and
    the bullet as the reference.

    Args:
        df: DataFrame containing the two columns named below.
        col_text: name of the column holding the text.
        col_bullets: name of the column holding the bullets to explode.

    Returns:
        The exploded DataFrame with one extra `<rouge_type>_recall` column
        for every entry of `config.ROUGE_TYPES`.
    """
    df_expl = df.explode(col_bullets)

    # NOTE(review): the keyword is spelled `use_agregator`; the recorded run
    # succeeded with it, so it presumably matches the installed `datasets`
    # version. Verify against that version before changing the spelling.
    rouge_res = df_expl[[col_text, col_bullets]].progress_apply(
        lambda row: metric.compute(
            # Read the two cells by column label: integer keys on a
            # label-indexed row are deprecated positional access in pandas.
            predictions=[row[col_text]],
            references=[row[col_bullets]],
            rouge_types=config.ROUGE_TYPES,
            use_agregator=False),
        axis=1)

    for r in config.ROUGE_TYPES:
        # A single prediction was scored, hence [0]; element 1 of that score
        # is stored as the recall (presumably a (precision, recall, fmeasure)
        # tuple; confirm against the metric's return type).
        df_expl[r + '_recall'] = rouge_res.map(lambda score: score[r][0][1])

    return df_expl


df_cc = calculate_rouge(df_cc, 'text', 'bullets')
df_moc = calculate_rouge(df_moc, 'text', 'bullets')

100%|██████████| 11094/11094 [01:30<00:00, 122.83it/s]
100%|██████████| 12085/12085 [01:42<00:00, 117.38it/s]


### **Keep highest recall from bullets**

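Each bullet is assigned only to the chunk with the highest ROUGE recall for it, so chunks that are never the best match for any bullet are neglected (around 30%).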

In [8]:
def assign_highest_recall_text(df):
    """Flag, for every distinct bullet, the text with the highest recall.

    A boolean `best_match` column is added to `df` IN PLACE (the caller's
    frame is modified). For each distinct value of `bullets`, the row with
    the largest `config.ROUGE_TYPE_RECALL` is selected, and every row of `df`
    carrying that same (bullet, text) pair is set to True.

    Args:
        df: DataFrame with `bullets`, `text` and the recall column.

    Returns:
        The same `df` object, now with the `best_match` column.
    """
    df['best_match'] = False

    # One winning text per distinct bullet (the Series is indexed by bullet).
    best_text_per_bullet = df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[config.ROUGE_TYPE_RECALL].argmax()])['text']

    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for bullet, text in best_text_per_bullet.items():
        is_winner = (df['bullets'] == bullet) & (df['text'] == text)
        df.loc[is_winner, 'best_match'] = True

    return df

df_cc_from_bull = assign_highest_recall_text(df_cc)
df_moc_from_bull = assign_highest_recall_text(df_moc)

100%|██████████| 2556/2556 [00:01<00:00, 1786.61it/s]
100%|██████████| 2556/2556 [00:01<00:00, 1881.25it/s]


In [9]:
# Per distinct chunk (unique `text` value): is it the best match of at least
# one bullet? Computed once and reused for both the count and the total.
cc_chunk_is_matched = df_cc_from_bull.groupby('text')['best_match'].apply(np.any)
print('Percentage of chunks with at least one bullet point in CC: %.2f %%' % (
    100 * np.sum(cc_chunk_is_matched.tolist()) / len(cc_chunk_is_matched)))

Percentage of chunks with at least one bullet point in CC: 70.94 %


In [10]:
# Per distinct chunk (unique `text` value): is it the best match of at least
# one bullet? Computed once and reused for both the count and the total.
moc_chunk_is_matched = df_moc_from_bull.groupby('text')['best_match'].apply(np.any)
print('Percentage of chunks with at least one bullet point in MoC: %.2f %%' % (
    100 * np.sum(moc_chunk_is_matched.tolist()) / len(moc_chunk_is_matched)))

Percentage of chunks with at least one bullet point in MoC: 67.04 %


In [11]:
# How many bullets were assigned to each CC chunk: collect the `best_match`
# flags of every (book, chapter, text) group and sum them, then move `text`
# out of the index into a column.
tmp = (
    df_cc_from_bull
    .groupby(['book', 'chapter', 'text'], sort=False)
    .agg({'best_match': list})['best_match']
    .map(np.sum)
    .reset_index(level=2)
)

# Describe only the chunks that received at least one bullet.
tmp = tmp[tmp['best_match'] != 0]
tmp['best_match'].describe()

count    1301.000000
mean        1.964643
std         1.132857
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max        11.000000
Name: best_match, dtype: float64

In [12]:
# Keep only the rows flagged as the best-matching text of their bullet.
cc_is_best = df_cc_from_bull['best_match']
df_cc_from_bull = df_cc_from_bull[cc_is_best]

moc_is_best = df_moc_from_bull['best_match']
df_moc_from_bull = df_moc_from_bull[moc_is_best]

In [13]:
# Sanity check: after filtering there must be exactly one row per distinct
# bullet (len() of a groupby is its number of groups).
assert len(df_cc_from_bull) == len(df_cc.groupby('bullets'))
assert len(df_moc_from_bull) == len(df_moc.groupby('bullets'))

#### Save dataset

In [14]:
# Make sure both output directories exist. exist_ok=True replaces the
# separate os.path.exists() check, so there is no window between checking
# and creating, and re-running the cell is a no-op.
os.makedirs(
    config.MAGMA_DIR+'datasets/karger_books_chunk_chapter/assign_bullets/'+MODEL+'/',
    exist_ok=True)

os.makedirs(
    config.MAGMA_DIR+'datasets/karger_books_moc/assign_bullets/'+MODEL+'/',
    exist_ok=True)
    

In [15]:
# Persist the bullet-to-chunk assignments, one df.csv per dataset, inside
# the per-model directories.
df_cc_from_bull.to_csv(config.MAGMA_DIR+'datasets/karger_books_chunk_chapter/assign_bullets/'+MODEL+'/df.csv')
# Fixed: the '/' before 'df.csv' was missing, so the MoC file was written as
# '<...>/assign_bullets/<MODEL>df.csv' instead of into the <MODEL>/ directory.
df_moc_from_bull.to_csv(config.MAGMA_DIR+'datasets/karger_books_moc/assign_bullets/'+MODEL+'/df.csv')

#### Create train, test, validation (CC)

In [16]:
# Output directory for the CC train/val/test files written below.
op = config.MAGMA_DIR+'datasets/karger_books_chunk_chapter/assign_bullets/'+MODEL+'/'
# exist_ok=True makes the call safe to re-run without a separate exists() check.
os.makedirs(op, exist_ok=True)

In [17]:
# Collapse back to one row per (book, chapter): the first two index levels.
# The chapter's bullets and their matched texts are gathered into two lists;
# only these two columns are kept.
df_cc_from_bull = (
    df_cc_from_bull
    .groupby(level=[0, 1], sort=False)
    .agg({'bullets': list, 'text': list})
)

In [18]:
# Shuffle the chapters reproducibly (seeded with config.SEED).
df_cc_from_bull = df_cc_from_bull.sample(frac=1, random_state=config.SEED)

# Running total of bullets over the shuffled chapters; the last value is the
# overall number of bullets.
df_cc_from_bull['num_bulls'] = df_cc_from_bull['bullets'].map(len).cumsum()
tot_bulls = df_cc_from_bull['num_bulls'].iloc[-1]

# Split positions: one past the first chapter at which the running total
# exceeds 80 % (resp. 90 %) of all bullets, so that chapter stays in the
# earlier split.
split1 = np.where(df_cc_from_bull['num_bulls'] > int(tot_bulls*0.8))[0][0] + 1
split2 = np.where(df_cc_from_bull['num_bulls'] > int(tot_bulls*0.9))[0][0] + 1
print(split1, split2)

361 408


In [19]:
# Slice the shuffled chapters at the two split positions and expand each
# chapter's bullet list to one row per bullet.
train = df_cc_from_bull.iloc[:split1].explode('bullets')
val = df_cc_from_bull.iloc[split1:split2].explode('bullets')
test = df_cc_from_bull.iloc[split2:].explode('bullets')

# `text` still holds the whole list on every row at this point; overwrite it
# with the separately exploded text column of the same slice. This relies on
# the bullets and text lists of a chapter having equal length so that both
# explodes produce the same rows in the same order.
train['text'] = df_cc_from_bull.iloc[:split1].explode('text')['text']
val['text'] = df_cc_from_bull.iloc[split1:split2].explode('text')['text']
test['text'] = df_cc_from_bull.iloc[split2:].explode('text')['text']

# One CSV per split in the CC output directory.
train.to_csv(op + 'train.csv')
val.to_csv(op + 'val.csv')
test.to_csv(op + 'test.csv')

In [20]:
# Write the training split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(op+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(op+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row['text'] + '\n')
        tr_t.write(row['bullets'] + '\n')

In [21]:
# Write the validation split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'val.source', 'w', encoding='utf-8') as va_s,\
    open(op+'val.target', 'w', encoding='utf-8') as va_t,\
    open(op+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row['text'] + '\n')
        va_t.write(row['bullets'] + '\n')

In [22]:
# Write the test split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'test.source', 'w', encoding='utf-8') as te_s,\
    open(op+'test.target', 'w', encoding='utf-8') as te_t,\
    open(op+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row['text'] + '\n')
        te_t.write(row['bullets'] + '\n')

#### Create train, test, validation (MoC)

In [23]:
# Output directory for the MoC train/val/test files written below.
op = config.MAGMA_DIR+'datasets/karger_books_moc/assign_bullets/'+MODEL+'/'
# exist_ok=True makes the call safe to re-run without a separate exists() check.
os.makedirs(op, exist_ok=True)

In [24]:
# Collapse to one row per (book, chapter): grouping on the first two index
# levels drops any further index level. The chapter's bullets and their
# matched texts are gathered into two lists; only these two columns are kept.
df_moc_from_bull = (
    df_moc_from_bull
    .groupby(level=[0, 1], sort=False)
    .agg({'bullets': list, 'text': list})
)

In [25]:
# Shuffle the chapters reproducibly (seeded with config.SEED).
df_moc_from_bull = df_moc_from_bull.sample(frac=1, random_state=config.SEED)

# Running total of bullets over the shuffled chapters; the last value is the
# overall number of bullets.
df_moc_from_bull['num_bulls'] = df_moc_from_bull['bullets'].map(len).cumsum()
tot_bulls = df_moc_from_bull['num_bulls'].iloc[-1]

# Split positions: one past the first chapter at which the running total
# exceeds 80 % (resp. 90 %) of all bullets, so that chapter stays in the
# earlier split.
split1 = np.where(df_moc_from_bull['num_bulls'] > int(tot_bulls*0.8))[0][0] + 1
split2 = np.where(df_moc_from_bull['num_bulls'] > int(tot_bulls*0.9))[0][0] + 1
print(split1, split2)

361 408


In [26]:
# Slice the shuffled chapters at the two split positions and expand each
# chapter's bullet list to one row per bullet.
train = df_moc_from_bull.iloc[:split1].explode('bullets')
val = df_moc_from_bull.iloc[split1:split2].explode('bullets')
test = df_moc_from_bull.iloc[split2:].explode('bullets')

# `text` still holds the whole list on every row at this point; overwrite it
# with the separately exploded text column of the same slice. This relies on
# the bullets and text lists of a chapter having equal length so that both
# explodes produce the same rows in the same order.
train['text'] = df_moc_from_bull.iloc[:split1].explode('text')['text']
val['text'] = df_moc_from_bull.iloc[split1:split2].explode('text')['text']
test['text'] = df_moc_from_bull.iloc[split2:].explode('text')['text']

# One CSV per split in the MoC output directory.
train.to_csv(op + 'train.csv')
val.to_csv(op + 'val.csv')
test.to_csv(op + 'test.csv')

In [27]:
# Write the training split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'train.source', 'w', encoding='utf-8') as tr_s,\
    open(op+'train.target', 'w', encoding='utf-8') as tr_t,\
    open(op+'train.index', 'w', encoding='utf-8') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row['text'] + '\n')
        tr_t.write(row['bullets'] + '\n')

In [28]:
# Write the validation split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'val.source', 'w', encoding='utf-8') as va_s,\
    open(op+'val.target', 'w', encoding='utf-8') as va_t,\
    open(op+'val.index', 'w', encoding='utf-8') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row['text'] + '\n')
        va_t.write(row['bullets'] + '\n')

In [29]:
# Write the test split as three parallel files, one line per example:
# .source (text), .target (bullet) and .index (the row's index value).
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default.
# NOTE(review): a text or bullet containing '\n' would break the one-line-
# per-example alignment between the files; confirm the data has none.
with open(op+'test.source', 'w', encoding='utf-8') as te_s,\
    open(op+'test.target', 'w', encoding='utf-8') as te_t,\
    open(op+'test.index', 'w', encoding='utf-8') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row['text'] + '\n')
        te_t.write(row['bullets'] + '\n')