### **Config**

In [1]:
import os
import sys

sys.path.insert(0, '/home/marco/epfl/magma/')
import config

In [2]:
MODEL = 't5'

RE_SPLITTER = '\n'              # do we split sentences of paragraphs?
                                # use '\.(?!\d)|\n' or '\n', respectively

# Output path
OUTPUT_PATH = config.MAGMA_DIR+'datasets/bullet_paragraph_embeddings/'+MODEL+'/'
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

### **Init**

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import nltk
import gensim
from textwrap import fill
from tqdm import tqdm
tqdm.pandas()

if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer =\
        T5Tokenizer.from_pretrained('t5-large')

## **Karger Books Base Dataset**

In [4]:
base_dataset = config.MAGMA_DIR+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
df.bullets = df.bullets.map(eval, na_action='ignore')

## **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> config.TOKEN_MAX_LEN chars)

In [5]:
# Split in sentences / paragraphs based on RE_SPLITTER
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')
    
# explode to get one row for each paragraph /sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces
df.para = df.para.map(lambda p:
    re.sub('\s+', ' ', p).strip())

# Remove long words (> config.TOKEN_MAX_LEN chars)
def para2words(para):
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=config.TOKEN_MAX_LEN)
df['para_proc'] = df.para.map(para2words)
df['bullets_proc'] = df.bullets.map(lambda bs: [para2words(b) for b in bs])

#### Further Preprocessing

* Remove stop words
* Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens)

In [6]:
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df.para_proc = df.para_proc.map(lambda p:
    [w for w in p if w not in stop_words])
df.bullets_proc = df.bullets_proc.map(lambda bs:
    [[w for w in b if w not in stop_words] for b in bs])

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Remove short sentences / paragraphs (< config.PARA_MIN_LEN tokens)
df.loc[df.para_proc.map(len) <\
    config.PARA_MIN_LEN, 'para_proc'] = np.nan

df = df.dropna()

In [8]:
df.para = df.para.map(lambda p: p+'.')

## **Assign Bullets to Best Para and Expand Functions**

In [9]:
def assign_best_metric_para(df, col_metric):
    df['best_match'] = False

    for idx, para  in df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[col_metric].argmax()]).para.iteritems():
        
        df.loc[\
            (df['bullets'] == idx) &\
            (df['para'] == para), 'best_match'] = True
    
    para_too_short =\
        df[(df['compression_ratio'] >= config.MAX_RATIO) & df['best_match']]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df[df['best_match']])*100))
    
    return df

In [10]:
def expand_up_down(df, col_metric):
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        df_bul = df[df['bullets'] == bul]
        
        # get book and chapter where this bullet is
        book = df_bul.index.get_level_values(0)[0]
        cpt = df_bul.index.get_level_values(1)[0]

        df_bul = df_bul.reset_index()
        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        merged_para_idx = [best_match_idx]

        bul_num_tok = df_bul.loc[best_match_idx, 'bullets_num_tokens']
        merged_para_num_tok = df_bul.loc[best_match_idx, 'para_num_tokens']
        comp_ratio = df_bul.loc[best_match_idx, 'compression_ratio']
        num_bul_cpt = len(set(df.loc[book, cpt].bullets.tolist()))
        max_idx = len(df_bul)-1
        
        while comp_ratio > config.MAX_RATIO and\
            merged_para_num_tok < tokenizer.model_max_length:
            
            # if we already merged all possible paragraphs
            if (0 in merged_para_idx) and (max_idx in merged_para_idx):
                break
                
            # if we already merged the first paragraph
            elif 0 in merged_para_idx:
                new_para_idx = max(merged_para_idx)+1
                
            # if we already merged the last paragraph
            elif max_idx in merged_para_idx:
                new_para_idx = min(merged_para_idx)-1
                
            # otherwise check for best metric inclusion
            else:
                if df_bul.loc[min(merged_para_idx)-1, col_metric] <\
                    df_bul.loc[max(merged_para_idx)+1, col_metric]:
                    # merge down
                    new_para_idx = max(merged_para_idx)+1
                    
                else: # merge up
                    new_para_idx = min(merged_para_idx)-1       

            df_bul.loc[new_para_idx, 'best_match'] = True
            merged_para_idx.append(new_para_idx)
            
            merged_para_num_tok += df_bul.loc[new_para_idx, 'para_num_tokens']
            comp_ratio = bul_num_tok / merged_para_num_tok

        for p, b in zip(df_bul.loc[merged_para_idx]['para'].tolist(),
            df_bul.loc[merged_para_idx]['bullets'].tolist()):
            df.loc[(df['para'] == p) &
                (df['bullets'] == b), 'best_match'] = True
        
    return df

In [11]:
def print_stats(df):
    num_para_tot = 18822
    num_para_kept = np.sum(df.groupby('para')['best_match'].apply(np.any).tolist())
    print('%d out of %d paragraphs are considered using this method.'%(num_para_kept, num_para_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_para_kept/num_para_tot))
    
    print()
    df_count_tokens = df.groupby('para', sort=False).agg({
        'best_match': lambda bm: np.any(list(bm)),
        'para_num_tokens': lambda pnt: list(pnt)[0]})
    num_tok_kept = df_count_tokens[df_count_tokens['best_match']].para_num_tokens.sum()
    num_tok_tot = df_count_tokens.para_num_tokens.sum()

    print('%d out of %d tokens are considered using this method.'%(num_tok_kept, num_tok_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_tok_kept/num_tok_tot))

def print_stats_after_merge(df):
    para_too_short = df[df['compression_ratio'] > config.MAX_RATIO]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df)*100))
    
    print()
    print('Paragraphs which are too long to fit into the model: %d paragraphs.'%\
          len(df[df['para_num_tokens'] > tokenizer.model_max_length]))
    print(df[df['para_num_tokens'] > tokenizer.model_max_length])

## **Word2Vec Book Level**

#### Create Word Vectors

In [12]:
op = OUTPUT_PATH + 'w2v/'
if not os.path.exists(op):
    os.makedirs(op)

In [13]:
df_w2v = df.copy()

In [14]:
df_w2v_book = df_w2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [15]:
df_w2v_book['corpus'] = df_w2v_book.para_proc + df_w2v_book.bullets_proc

In [16]:
df_w2v_book['w2v'] = df_w2v_book.corpus.progress_map(lambda c:\
    gensim.models.Word2Vec(
        c,
        #size=128,
        #window=3,
        min_count=1,
        sg=1, # 1 for skip-gram; otherwise CBOW.
        seed = config.SEED))

100%|██████████| 53/53 [00:39<00:00,  1.34it/s]


In [17]:
def assign_word_vectors(r, col):
    book = r.name[0]
    wv = df_w2v_book.loc[book, 'w2v'].wv
    wv_list = []
    for x in r[col]:
        try:
            v = wv[x]
        except:
            continue
        wv_list.append(v)
    return wv_list

df_w2v['para_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'para_proc'), axis=1)

# taking the average of the w2v vector of each paragraph
df_w2v.para_wv = df_w2v.para_wv.progress_map(lambda p_wv: np.mean(p_wv, axis=0))

100%|██████████| 18773/18773 [00:01<00:00, 12560.17it/s]
100%|██████████| 18773/18773 [00:00<00:00, 31126.74it/s]


In [18]:
print(df_w2v[df_w2v.para_wv.isna()])
df_w2v = df_w2v.dropna()

Empty DataFrame
Columns: [para, bullets, para_proc, bullets_proc, para_wv]
Index: []


#### Explode, preprocess, w2v bullets

In [19]:
df_w2v = df_w2v.explode('bullets')

df_w2v['bullets_proc'] = df_w2v.bullets.progress_map(para2words)
df_w2v.bullets_proc = df_w2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:07<00:00, 14715.48it/s]
100%|██████████| 114277/114277 [00:05<00:00, 22594.26it/s]


In [20]:
df_w2v['bullets_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'bullets_proc'), axis=1)

# taking the average of the w2v vector of each bullet
df_w2v.bullets_wv = df_w2v.bullets_wv.progress_map(lambda b_wv: np.mean(b_wv, axis=0))

100%|██████████| 114277/114277 [00:08<00:00, 14144.41it/s]
100%|██████████| 114277/114277 [00:02<00:00, 39819.31it/s]


In [21]:
df_w2v['para_num_tokens'] = df_w2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_w2v['bullets_num_tokens'] = df_w2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_w2v['compression_ratio'] = df_w2v.bullets_num_tokens / df_w2v.para_num_tokens

100%|██████████| 114277/114277 [00:50<00:00, 2284.22it/s]
100%|██████████| 114277/114277 [00:40<00:00, 2822.96it/s]


#### Calculate cosine similarity between each couple bullet-para

In [22]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

In [23]:
df_w2v['cosine_sim'] = df_w2v[['para_wv', 'bullets_wv']].progress_apply(lambda row:\
    cosine_sim(row[0], row[1]), axis=1)

100%|██████████| 114277/114277 [00:02<00:00, 40348.92it/s]


#### Find Best Match and Expand

In [24]:
# find best match bullet-para for each bullet
df_w2v = assign_best_metric_para(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [00:01<00:00, 2111.63it/s]


Percentage of paragraphs which are too short to be summarized: 55.79 %


In [25]:
df_w2v = expand_up_down(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [01:39<00:00, 25.57it/s]


In [26]:
print_stats(df_w2v)

4022 out of 18822 paragraphs are considered using this method. Thus, 21.37 %

423472 out of 1493612 tokens are considered using this method. Thus, 28.35 %


In [27]:
df_w2v_merge = df_w2v[df_w2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    'para_num_tokens': sum,
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_w2v_merge = df_w2v_merge.rename(columns={'para': 'text'})

df_w2v_merge['compression_ratio'] = df_w2v_merge.bullets_num_tokens / df_w2v_merge.para_num_tokens

In [28]:
print_stats_after_merge(df_w2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.31 %

Paragraphs which are too long to fit into the model: 25 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541062 ch_10    Nine biological therapies are now licensed for...   
9781908541086 ch_5     Clinicians and patients are best served by con...   
              ch_11    Eating disorders not otherwise specified is th...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch11     Re

In [29]:
df_w2v_merge[df_w2v_merge['para_num_tokens'] > tokenizer.model_max_length].para_num_tokens.describe()

count     25.000000
mean     561.680000
std       45.329093
min      514.000000
25%      525.000000
50%      546.000000
75%      584.000000
max      684.000000
Name: para_num_tokens, dtype: float64

#### Save dataset

In [30]:
df_w2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [31]:
df_w2v_merge = df_w2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [32]:
df_w2v_merge = df_w2v_merge.sample(frac=1, random_state=config.SEED)
df_w2v_merge['num_bulls'] = df_w2v_merge.bullets.map(len).cumsum()
tot_bulls = df_w2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [33]:
train, val, test =\
    df_w2v_merge.iloc[:split1].explode('bullets'),\
    df_w2v_merge.iloc[split1:split2].explode('bullets'),\
    df_w2v_merge.iloc[split2:].explode('bullets')

train['text'] = df_w2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_w2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_w2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [34]:
with open(op+'train.source', 'w') as tr_s,\
    open(op+'train.target', 'w') as tr_t,\
    open(op+'train.index', 'w') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')
        
with open(op+'val.source', 'w') as va_s,\
    open(op+'val.target', 'w') as va_t,\
    open(op+'val.index', 'w') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')
        
with open(op+'test.source', 'w') as te_s,\
    open(op+'test.target', 'w') as te_t,\
    open(op+'test.index', 'w') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

## **Doc2Vec Book Level**

#### Create Doc Vectors

In [35]:
op = OUTPUT_PATH + 'd2v/'
if not os.path.exists(op):
    os.makedirs(op)

In [36]:
df_d2v = df.copy()

In [37]:
df_d2v_book = df_d2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [38]:
df_d2v_book['corpus'] = df_d2v_book.para_proc + df_d2v_book.bullets_proc
df_d2v_book['tagged_corpus'] = df_d2v_book.corpus.map(lambda c:
    [gensim.models.doc2vec.TaggedDocument(para, [i]) for i, para in enumerate(c)])

In [39]:
df_d2v_book['d2v'] = df_d2v_book.tagged_corpus.progress_map(lambda tc:\
    gensim.models.Doc2Vec(
        tc,
        dm=1, # 1 for PV-DM; otherwise PV-DBOW
        #vector_size=128,
        #window=3,
        #epochs=5,
        min_count=1,
        seed = config.SEED))

100%|██████████| 53/53 [00:40<00:00,  1.31it/s]


#### Explode and preprocess bullets

In [40]:
df_d2v = df_d2v.explode('bullets')

df_d2v['bullets_proc'] = df_d2v.bullets.progress_map(para2words)
df_d2v.bullets_proc = df_d2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:07<00:00, 16026.14it/s]
100%|██████████| 114277/114277 [00:04<00:00, 26443.51it/s]


In [41]:
df_d2v['para_num_tokens'] = df_d2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_d2v['bullets_num_tokens'] = df_d2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_d2v['compression_ratio'] = df_d2v.bullets_num_tokens / df_d2v.para_num_tokens

100%|██████████| 114277/114277 [00:40<00:00, 2819.05it/s]
100%|██████████| 114277/114277 [00:40<00:00, 2826.34it/s]


#### Calculate similarity between each couple bullet-para

In [42]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def d2v_similarity(r):
    book = r.name[0]
    d2v = df_d2v_book.loc[book, 'd2v']
    dv_para = d2v.infer_vector(r.para_proc)
    dv_bullets = d2v.infer_vector(r.bullets_proc)
    
    return cosine_sim(dv_para, dv_bullets)
    
df_d2v['d2v_sim'] = df_d2v.progress_apply(lambda row: d2v_similarity(row), axis=1)

100%|██████████| 114277/114277 [02:29<00:00, 766.60it/s]


#### Find Best Match and Expand

In [43]:
# find best match bullet-para for each bullet
df_d2v = assign_best_metric_para(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1292.53it/s]


Percentage of paragraphs which are too short to be summarized: 55.71 %


In [44]:
df_d2v_expand = expand_up_down(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [01:31<00:00, 28.06it/s]


In [45]:
print_stats(df_d2v)

4023 out of 18822 paragraphs are considered using this method. Thus, 21.37 %

421842 out of 1493612 tokens are considered using this method. Thus, 28.24 %


In [46]:
df_d2v_merge = df_d2v[df_d2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    'para_num_tokens': sum,
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_d2v_merge = df_d2v_merge.rename(columns={'para': 'text'})

df_d2v_merge['compression_ratio'] = df_d2v_merge.bullets_num_tokens / df_d2v_merge.para_num_tokens

In [47]:
print_stats_after_merge(df_d2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.35 %

Paragraphs which are too long to fit into the model: 29 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541406 ch_4     Specific inquiry should be made about oral ant...   
              ch_15    The direct factor Xa and thrombin inhibitors c...   
9781908541680 ch_6     Healthy diets focus on fruit and vegetables, w...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Re

#### Save dataset

In [48]:
df_d2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [49]:
df_d2v_merge = df_d2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [50]:
df_d2v_merge = df_d2v_merge.sample(frac=1, random_state=config.SEED)
df_d2v_merge['num_bulls'] = df_d2v_merge.bullets.map(len).cumsum()
tot_bulls = df_d2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [51]:
train, val, test =\
    df_d2v_merge.iloc[:split1].explode('bullets'),\
    df_d2v_merge.iloc[split1:split2].explode('bullets'),\
    df_d2v_merge.iloc[split2:].explode('bullets')

train['text'] = df_d2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_d2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_d2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [52]:
with open(op+'train.source', 'w') as tr_s,\
    open(op+'train.target', 'w') as tr_t,\
    open(op+'train.index', 'w') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')
        
with open(op+'val.source', 'w') as va_s,\
    open(op+'val.target', 'w') as va_t,\
    open(op+'val.index', 'w') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')
        
with open(op+'test.source', 'w') as te_s,\
    open(op+'test.target', 'w') as te_t,\
    open(op+'test.index', 'w') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

## **Sentence-Transformers Book Level**

In [53]:
op = OUTPUT_PATH + 'st/'
if not os.path.exists(op):
    os.makedirs(op)

In [54]:
df_st = df.copy()

#### Create embedding vectors for para

In [55]:
from sentence_transformers import SentenceTransformer

# might want to try 'msmarco-distilbert-base-v2' too
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

df_st['para_enc'] = df_st.para.progress_map(model.encode)

100%|██████████| 18773/18773 [24:22<00:00, 12.84it/s]


#### Explode bullets

In [56]:
df_st = df_st.explode('bullets')

In [57]:
df_st['para_num_tokens'] = df_st.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_st['bullets_num_tokens'] = df_st.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_st['compression_ratio'] = df_st.bullets_num_tokens / df_st.para_num_tokens

100%|██████████| 114277/114277 [00:43<00:00, 2637.41it/s]
100%|██████████| 114277/114277 [00:33<00:00, 3366.45it/s]


#### Create embedding vectors for bullets

In [58]:
bull_to_embed = df_st.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(set(b))
}).explode('bullets')

bull_to_embed['bullets_enc'] = bull_to_embed.bullets.progress_map(model.encode)

100%|██████████| 2556/2556 [01:32<00:00, 27.75it/s]


#### Calculate similarity between each couple bullet-para

In [59]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def sentence_transformers_sim(r):
    book = r.name[0]
    b2e = bull_to_embed.loc[book]
    para_enc = r.para_enc
    bullets_enc = b2e.loc[(b2e.bullets == r.bullets), 'bullets_enc']
    assert len(bullets_enc) == 1
    bullets_enc = bullets_enc[0]
    
    return cosine_sim(para_enc, bullets_enc)
    
df_st['st_sim'] = df_st.progress_apply(sentence_transformers_sim, axis=1)

100%|██████████| 114277/114277 [01:34<00:00, 1203.15it/s]


### Find Best Match and Expand

In [60]:
# find best match bullet-para for each bullet
df_st = assign_best_metric_para(df_st, 'st_sim')

100%|██████████| 2556/2556 [00:01<00:00, 2190.16it/s]


Percentage of paragraphs which are too short to be summarized: 64.01 %


In [61]:
df_st_base = expand_up_down(df_st.copy(), 'st_sim')

100%|██████████| 2556/2556 [01:41<00:00, 25.09it/s]


In [62]:
print_stats(df_st_base)

4560 out of 18822 paragraphs are considered using this method. Thus, 24.23 %

430587 out of 1493612 tokens are considered using this method. Thus, 28.83 %


### Study Overlaps

##### Functions

In [63]:
def create_overlap_matrix(r):
    num_bulls = len(r.selected_para)
    #assert num_bulls == len(r.bullets)
    overlap_matrix = np.zeros((num_bulls,num_bulls))
    
    def list_overlap(a, b):
        return list( set(a).intersection(set(b)) )
    
    for i in range(num_bulls):
        for j in range(num_bulls):
            if i == j : continue
            num_tok_i = np.sum(r.para_num_tokens[r.selected_para[i]])
            overlap = list_overlap(
                r.selected_para[i], r.selected_para[j])
            num_tok_overlap = np.sum(r.para_num_tokens[overlap])
            assert num_tok_overlap <= num_tok_i
            
            overlap_matrix[i, j] = round(num_tok_overlap/num_tok_i*100, 2)
    
    return overlap_matrix

def find_big_overlap(r, threshold):
    om = r.overlap_matrix
    big_overlap_idx = np.argwhere(om >= threshold)
    big_overlap_idx = set([frozenset(t) for t in big_overlap_idx])
    merged = set()
    to_be_merged = set()
    for idx in big_overlap_idx:
        idx = tuple(idx)
        i, j = idx[0], idx[1]
        if i not in merged and j not in merged:
            to_be_merged.add(idx)
            merged.add(i)
            merged.add(j)
    return to_be_merged

def merge_bullets(r):
    for i, j in r.to_be_merged:
        r.bullets[i] += (' '+r.bullets[j])
        r.bullets_num_tokens[i] += r.bullets_num_tokens[j]
        r.selected_para[i] = np.array(list(set(
            np.concatenate((r.selected_para[i], r.selected_para[j])))))
        
        r.bullets[j] = None
        r.bullets_num_tokens[j] = None
        r.selected_para[j] = None
    r.bullets = [b for b in r.bullets if b is not None]
    r.bullets_num_tokens = [bnt for bnt in r.bullets_num_tokens if bnt is not None]
    r.selected_para = [sp for sp in r.selected_para if sp is not None]

##### Base

In [64]:
df_st_base_study = df_st_base.groupby(['book', 'chapter', 'bullets'], sort=False).agg({
    'para': lambda p: list(p),
    'para_num_tokens': lambda pnt: list(pnt),
    'bullets_num_tokens': lambda bnt: list(bnt)[0],
    'best_match': lambda bm: list(bm)
}).reset_index('bullets')
df_st_base_study.best_match = df_st_base_study.best_match.map(lambda bm: np.where(bm)[0])
df_st_base_study = df_st_base_study.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(b),
    'para': lambda p: list(p)[0],
    'para_num_tokens': lambda pnt: list(pnt)[0],
    'bullets_num_tokens': lambda bnt: list(bnt),
    'best_match': lambda bm: list(bm)
}).rename(columns={'best_match': 'selected_para'})
df_st_base_study.para_num_tokens = df_st_base_study.para_num_tokens.map(np.array)

In [65]:
while True:
    print('\nTotal number of bullets: %d'%(df_st_base_study.bullets.map(len).sum()))
    df_st_base_study['overlap_matrix'] = df_st_base_study.apply(create_overlap_matrix, axis=1)

    df_st_base_study['to_be_merged'] = df_st_base_study.apply(lambda row: find_big_overlap(row, 90), axis=1)

    num_to_be_merged = df_st_base_study.to_be_merged.map(len).sum()
    print('Bullets to be merged: %d'%num_to_be_merged)
    if (num_to_be_merged <= 0) : break

    df_st_base_study.apply(merge_bullets, axis=1)


Total number of bullets: 2556
Bullets to be merged: 407

Total number of bullets: 2149
Bullets to be merged: 67

Total number of bullets: 2082
Bullets to be merged: 6

Total number of bullets: 2076
Bullets to be merged: 2

Total number of bullets: 2074
Bullets to be merged: 0


### Putting Things Together

##### Base

In [66]:
df_st_base_save =\
    df_st_base[df_st_base['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
    .agg({
        'para': lambda p: ' '.join(list(p)),
        'para_num_tokens': sum,
        'bullets_num_tokens': lambda bnt: list(bnt)[0]
    }).reset_index(level='bullets')
df_st_base_save = df_st_base_save.rename(columns={'para': 'text'})

df_st_base_save['compression_ratio'] = df_st_base_save.bullets_num_tokens / df_st_base_save.para_num_tokens

In [67]:
df_st_base_save.bullets_num_tokens.describe()

count    2556.000000
mean       35.391236
std        20.472680
min         3.000000
25%        22.000000
50%        31.000000
75%        44.000000
max       280.000000
Name: bullets_num_tokens, dtype: float64

In [68]:
df_st_base_save.para_num_tokens.describe()

count    2556.000000
mean      206.664710
std        93.792847
min        32.000000
25%       139.000000
50%       190.000000
75%       259.000000
max       647.000000
Name: para_num_tokens, dtype: float64

In [69]:
print_stats_after_merge(df_st_base_save)

Percentage of paragraphs which are too short to be summarized: 0.27 %

Paragraphs which are too long to fit into the model: 26 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541086 ch_11    Eating disorders not otherwise specified is th...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541420 ch_6     Drugs used in the management of asthma can be ...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch11     Re

##### Merged Overlaps

In [70]:
#df_st_merge_no_overlap = 
df_st_base_study['bp'] = df_st_base_study.apply(lambda r:\
    [(b, sp, bnt) for b, sp, bnt in zip(r.bullets, r.selected_para, r.bullets_num_tokens)], axis=1)
df_st_merged_overlaps = df_st_base_study.explode('bp')
df_st_merged_overlaps.bullets = df_st_merged_overlaps.bp.map(lambda t: t[0])
df_st_merged_overlaps.selected_para = df_st_merged_overlaps.bp.map(lambda t: t[1])
df_st_merged_overlaps.bullets_num_tokens = df_st_merged_overlaps.bp.map(lambda t: t[2])

df_st_merged_overlaps['para'] = df_st_merged_overlaps.apply(lambda row:\
    ' '.join([p for i, p in enumerate(row.para) if i in row.selected_para]), axis=1)

df_st_merged_overlaps['para_num_tokens'] = df_st_merged_overlaps.apply(lambda row:\
    sum([p for i, p in enumerate(row.para_num_tokens) if i in row.selected_para]), axis=1)

df_st_merged_overlaps = df_st_merged_overlaps.drop(
    columns=['overlap_matrix', 'to_be_merged', 'bp', 'selected_para']).rename(columns={'para': 'text'})

df_st_merged_overlaps['compression_ratio'] =\
    df_st_merged_overlaps.bullets_num_tokens / df_st_merged_overlaps.para_num_tokens

In [71]:
df_st_merged_overlaps.bullets_num_tokens.describe()

count    2074.000000
mean       43.616201
std        31.358658
min         3.000000
25%        24.000000
50%        35.000000
75%        53.000000
max       341.000000
Name: bullets_num_tokens, dtype: float64

In [72]:
df_st_merged_overlaps.para_num_tokens.describe()

count    2074.000000
mean      214.580521
std        95.999375
min        32.000000
25%       144.000000
50%       197.000000
75%       267.000000
max       647.000000
Name: para_num_tokens, dtype: float64

In [73]:
df_st_merged_overlaps[df_st_merged_overlaps['compression_ratio'] > config.MAX_RATIO]

Unnamed: 0_level_0,Unnamed: 1_level_0,bullets,text,para_num_tokens,bullets_num_tokens,compression_ratio
book,chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9781905832729,ch_4,The surgical procedure of choice in women is i...,insertion of an artificial urinary sphincter. ...,218,86,0.394495
9781905832729,ch_7,Blood in the urine can originate from anywhere...,Hematuria can originate from anywhere along th...,112,39,0.348214
9781905832729,ch_8,Recurrent urinary tract infection (UTI) is def...,Recurrent urinary tract infection. Recurrent U...,223,78,0.349776
9781905832729,ch_8,Urinalysis is quick and easy. A positive test ...,"Diagnosis. Urinalysis, using dipstick tests fo...",162,54,0.333333
9781905832729,ch_9,"It is a relatively common condition, but gener...",Pharmacological treatment. Antimuscarinic drug...,245,64,0.261224
...,...,...,...,...,...,...
9783318067095,ch9,Validation ensures that the technology is meas...,Once you have arrived at a construct to measur...,529,240,0.453686
9783318068207,hh-6,A basket trial is a biomarker-driven study in ...,"Traditionally, oncology Phase I clinical trial...",200,71,0.355000
9783318068207,hh-7,"The molecular status of, at least, EGFR, ALK, ...",NGS testing for predictive biomarkers. EGFR ac...,402,113,0.281095
9783318068207,hh-7,PD-L1 expression also needs to be tested in pa...,Programmed death-ligand 1. In addition to the ...,113,43,0.380531


In [74]:
print_stats_after_merge(df_st_merged_overlaps)

Percentage of paragraphs which are too short to be summarized: 14.27 %

Paragraphs which are too long to fit into the model: 25 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
9781908541086 ch_11    Eating disorders not otherwise specified is th...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541420 ch_6     Drugs used in the management of asthma can be ...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch10     - an infant or child with photophobia, constan...   
              ch11     Refer urgently, to be seen by an ophthalmologi...   
9781910797211 ch02     A

### Save dataset

In [75]:
df_st_base_selected_para = df_st_base.groupby(['book', 'chapter', 'para'], sort=False).agg({
    'best_match': lambda b: np.any(list(b))
}).reset_index('para').groupby(['book', 'chapter'], sort=False).agg({
    'para': lambda p: list(p),
    'best_match': lambda b: list(b)
})
df_st_base_selected_para.best_match = df_st_base_selected_para.best_match.map(lambda b: list(np.where(b)[0]))
df_st_base_selected_para.to_csv(op+'df_base_selected_para.csv')

In [76]:
for d, df in zip(
    ['base', 'merged_overlaps'],
    [df_st_base_save, df_st_merged_overlaps]):

    op = OUTPUT_PATH + 'st/'+d+'/'
    if not os.path.exists(op):
        os.makedirs(op)
        
    df.to_csv(op+'df.csv')
        
    df = df.groupby(level=[0, 1], sort=False).agg({
        'bullets': lambda b: list(b),
        'text': lambda t: list(t),
    })
    
    df = df.sample(frac=1, random_state=config.SEED)
    df['num_bulls'] = df.bullets.map(len).cumsum()
    tot_bulls = df.num_bulls.iloc[-1]
    split1 = np.where(df.num_bulls > int(tot_bulls*0.8))[0][0]+1
    split2 = np.where(df.num_bulls > int(tot_bulls*0.9))[0][0]+1
    print(split1, split2)
    
    train, val, test =\
    df.iloc[:split1].explode('bullets'),\
    df.iloc[split1:split2].explode('bullets'),\
    df.iloc[split2:].explode('bullets')

    train['text'] = df.iloc[:split1].explode('text')['text']
    val['text'] = df.iloc[split1:split2].explode('text')['text']
    test['text'] = df.iloc[split2:].explode('text')['text']

    train.to_csv(op+'train.csv')
    val.to_csv(op+'val.csv')
    test.to_csv(op+'test.csv')
    
    with open(op+'train.source', 'w') as tr_s,\
        open(op+'train.target', 'w') as tr_t,\
        open(op+'train.index', 'w') as tr_i:
        for idx, row in train[['text', 'bullets']].iterrows():
            tr_i.write(str(idx) + '\n')
            tr_s.write(row.text + '\n')
            tr_t.write(row.bullets + '\n')
        
    with open(op+'val.source', 'w') as va_s,\
        open(op+'val.target', 'w') as va_t,\
        open(op+'val.index', 'w') as va_i:
        for idx, row in val[['text', 'bullets']].iterrows():
            va_i.write(str(idx) + '\n')
            va_s.write(row.text + '\n')
            va_t.write(row.bullets + '\n')

    with open(op+'test.source', 'w') as te_s,\
        open(op+'test.target', 'w') as te_t,\
        open(op+'test.index', 'w') as te_i:
        for idx, row in test[['text', 'bullets']].iterrows():
            te_i.write(str(idx) + '\n')
            te_s.write(row.text + '\n')
            te_t.write(row.bullets + '\n')

361 408
363 408


In [77]:
for d, df in zip(
    ['base', 'merged_overlaps'],
    [df_st_base_save, df_st_merged_overlaps]):

    op = OUTPUT_PATH + 'st/'+d+'/bybook/'
    if not os.path.exists(op):
        os.makedirs(op)
        
    df.to_csv(op+'df.csv')
        
    df = df.groupby(level=[0, 1], sort=False).agg({
        'bullets': lambda b: list(b),
        'text': lambda t: list(t),
    })
    
    df = df.sample(frac=1, random_state=config.SEED)
    
    book_ids = list(set(df.index.get_level_values(0)))
    train = df.loc[book_ids[:int(0.8*len(book_ids))]].explode('bullets')
    val = df.loc[book_ids[int(0.8*len(book_ids)):int(0.9*len(book_ids))]].explode('bullets')
    test = df.loc[book_ids[int(0.9*len(book_ids)):]].explode('bullets')
    
    train.text = df.loc[book_ids[:int(0.8*len(book_ids))]].explode('text').text
    val.text = df.loc[book_ids[int(0.8*len(book_ids)):int(0.9*len(book_ids))]].explode('text').text
    test.text = df.loc[book_ids[int(0.9*len(book_ids)):]].explode('text').text
    print(len(train), len(val), len(test))
    
    train.to_csv(op+'train.csv')
    val.to_csv(op+'val.csv')
    test.to_csv(op+'test.csv')
    
    with open(op+'train.source', 'w') as tr_s,\
        open(op+'train.target', 'w') as tr_t,\
        open(op+'train.index', 'w') as tr_i:
        for idx, row in train[['text', 'bullets']].iterrows():
            tr_i.write(str(idx) + '\n')
            tr_s.write(row.text + '\n')
            tr_t.write(row.bullets + '\n')
        
    with open(op+'val.source', 'w') as va_s,\
        open(op+'val.target', 'w') as va_t,\
        open(op+'val.index', 'w') as va_i:
        for idx, row in val[['text', 'bullets']].iterrows():
            va_i.write(str(idx) + '\n')
            va_s.write(row.text + '\n')
            va_t.write(row.bullets + '\n')

    with open(op+'test.source', 'w') as te_s,\
        open(op+'test.target', 'w') as te_t,\
        open(op+'test.index', 'w') as te_i:
        for idx, row in test[['text', 'bullets']].iterrows():
            te_i.write(str(idx) + '\n')
            te_s.write(row.text + '\n')
            te_t.write(row.bullets + '\n')

2089 195 272
1700 150 224


## **Print Some Examples**

In [78]:
def nice_print(idx, bull, list_text, list_text_num_tok, list_method):
    print(idx)
    print()
    print('Bullet:')
    print(fill(bull, 100))
    print()
    for t, tok, m in zip(list_text, list_text_num_tok, list_method):
        print(m+' (' +str(tok)+'):')
        print(fill(t, 100))
        print()
    
    print(''.join(['#']*100))
    print()

### Sentence-Transformer vs Rouge

In [79]:
df_st = pd.read_csv(OUTPUT_PATH+'st/base/df.csv').set_index(['book', 'chapter'])
df_rouge = pd.read_csv(config.MAGMA_DIR+'datasets/bullet_paragraph_rouge/'+MODEL+'/df.csv')\
    .set_index(['book', 'chapter'])

In [80]:
import random

random.seed(config.SEED)

bullet_examples = random.sample(df_st.bullets.tolist(), 20)
print(bullet_examples)

['Hypertension is both an important cause and a consequence of renal disease.', 'Neuronal tumors are uncommon brain neoplasms typically diagnosed in children and young adults.', 'The prevention of nausea has been much less successful with currently approved agents.', 'Lung transplantation in patients with very advanced COPD improves health status and functional capacity, though it does not convey a survival benefit.', 'Acne presents with both inflammatory and comedonal lesions in most patients.', 'Decreased waist circumference in the absence of weight loss can keep a patient motivated.', 'Causes of acute asthma include viral respiratory infections, acute allergen exposure, food allergies and some medications such as acetylsalicylic acid (aspirin) and non-steroidal anti-inflammatory drugs.', 'The slight differences between batches of biologics, or between biologics and biosimilars, are evaluated and not expected to have any meaningful effect in clinical use.', 'Unlike hemophilia A, the 

In [81]:
list_method = ['Sentence-Transformers', 'ROUGE']

for bull in bullet_examples:
    idx = df_st.loc[df_st.bullets == bull].index.tolist()[0]
    
    list_text = [\
        df_st.loc[df_st.bullets == bull, 'text'].tolist()[0],
        df_rouge.loc[df_rouge.bullets == bull, 'text'].tolist()[0]]
    
    list_text_num_tok = [\
        df_st.loc[df_st.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_rouge.loc[df_rouge.bullets == bull, 'para_num_tokens'].tolist()[0]]
    
    nice_print(idx, bull, list_text, list_text_num_tok, list_method)

(9781908541468, 'ch_7')

Bullet:
Hypertension is both an important cause and a consequence of renal disease.

Sentence-Transformers (75):
Hypertension associated with parenchymal kidney disease represents a potent vicious cycle; it is
both a consequence of CKD and a cause of progressive kidney damage. Evidence from many studies shows
that treatment of hypertension is crucial to slowing progression of renal disease, particularly
among those with significant albuminuria (> 1 g/24 hours).

ROUGE (306):
Hypertension during pregnancy is defined as any rise in systolic blood pressure of more than 30 mmHg
or a rise in diastolic blood pressure of more than 15 mmHg above baseline, or the use of
antihypertensive agents. It is classified according to its presentation (Table 5.5). Chronic
hypertension is more common in multiparous women, and is present at the first antenatal visit. On
the other hand, pre-eclampsia is more common in primigravidas (in 10% of first pregnancies), and
represents an imp