In [1]:
# Root of the local magma checkout: it is put on sys.path to import `config`
# and it prefixes every dataset path used below.
# NOTE(review): hard-coded absolute path - adjust when running elsewhere.
magma_dir = '/home/marco/epfl/magma/'

### **Config**

In [2]:
import os
import sys

sys.path.insert(0, magma_dir)
import config

In [3]:
MODEL = 't5'                    # selects the tokenizer ('pegasus', 'bart'
                                # or 't5') and the output sub-directory

RE_SPLITTER = '\n'              # regex the text is split on:
                                # '\n' -> paragraphs,
                                # '\.(?!\d)|\n' -> sentences

TOKEN_MAX_LEN = 99              # max length of a word, in characters
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

RECALL_THRESHOLD = 0.7

# Output path
OUTPUT_PATH = magma_dir+'datasets/karger_books_para_wordembed/'+MODEL+'/'
# exist_ok=True replaces the racy "check, then create" pair
os.makedirs(OUTPUT_PATH, exist_ok=True)

### **Init**

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import nltk
import gensim
from textwrap import fill
from tqdm import tqdm
tqdm.pandas()

# Load the tokenizer matching MODEL (substring match on the model family).
# It is used below to count tokens and to read `model_max_length`.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer =\
        T5Tokenizer.from_pretrained('t5-large')
else:
    # Fail here rather than with a NameError on `tokenizer` many cells later.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing "
        "'pegasus', 'bart' or 't5'" % MODEL)

### **Karger Books Base Dataset**

In [5]:
# Load the base Karger Books dataset and index it by its location in the book.
base_dataset = magma_dir+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
# Parse the `bullets` strings back into Python objects (presumably lists of
# bullet strings - they are exploded and iterated over further down).
# NOTE(review): eval() executes arbitrary code; if the CSV is not fully
# trusted, ast.literal_eval would be the safe replacement.
df.bullets = df.bullets.map(eval, na_action='ignore')

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)

In [6]:
# Split in sentences / paragraphs based on RE_SPLITTER
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces
# (raw string: '\s' inside a plain literal is an invalid escape sequence)
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Remove long words (> TOKEN_MAX_LEN chars)
def para2words(para):
    """Tokenize `para` with gensim's simple_preprocess.

    Accents are removed (deacc=True) and tokens longer than TOKEN_MAX_LEN
    characters are discarded. Returns the list of remaining tokens.
    """
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=TOKEN_MAX_LEN)
df['para_proc'] = df.para.map(para2words)
# `bullets` holds several bullets per row: tokenize each one separately
df['bullets_proc'] = df.bullets.map(lambda bs: [para2words(b) for b in bs])

#### Further Preprocessing

* Remove stop words
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [7]:
# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests; `stop_words` itself stays a
# list because later cells refer to it.
stop_word_set = set(stop_words)

# Drop stop words from the paragraph tokens ...
df.para_proc = df.para_proc.map(lambda p:
    [w for w in p if w not in stop_word_set])
# ... and from every bullet's tokens
df.bullets_proc = df.bullets_proc.map(lambda bs:
    [[w for w in b if w not in stop_word_set] for b in bs])

[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


In [8]:
# Discard sentences / paragraphs left with fewer than PARA_MIN_LENGTH tokens
# after preprocessing, together with any row that has a missing value.
too_short = df.para_proc.map(len) < PARA_MIN_LENGTH
df = df[~too_short].dropna()

In [9]:
# Put back a final period: trailing '.' characters were stripped from every
# paragraph during preprocessing.
df.para = df.para.map(lambda p: p+'.')

### **Assign Bullets to Best Para and Expand Functions**

In [10]:
def assign_best_metric_para(df, col_metric):
    """Flag, for every bullet, the paragraph scoring highest on `col_metric`.

    A boolean ``best_match`` column is (re)created on `df` in place and set to
    True on the rows whose (bullets, para) pair is the best one of its bullet.
    Also prints the share of best matches whose compression ratio is at least
    ``config.MAX_RATIO``.

    Args:
        df: frame with 'bullets', 'para', 'compression_ratio' and
            `col_metric` columns (one row per paragraph-bullet pair).
        col_metric: name of the column holding the similarity score.

    Returns:
        The same `df`, with the ``best_match`` column filled in.
    """
    df['best_match'] = False

    # One row per bullet: the row of its group with the highest metric value.
    # progress_apply needs tqdm.pandas() to have been called.
    best_rows = df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[col_metric].argmax()])

    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for idx, para in best_rows.para.items():
        df.loc[\
            (df['bullets'] == idx) &\
            (df['para'] == para), 'best_match'] = True

    para_too_short =\
        df[(df['compression_ratio'] >= config.MAX_RATIO) & df['best_match']]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df[df['best_match']])*100))

    return df

In [12]:
def expand_up_down(df, col_metric, verbose=False):
    """Grow each bullet's best-match paragraph with its neighbours.

    For every distinct bullet the rows of that bullet are taken in their
    original order. Starting from the row flagged ``best_match``, adjacent
    rows are merged one at a time while the compression ratio (bullet tokens
    over merged paragraph tokens) is above ``config.MAX_RATIO``:

    * if the first row is already merged, the next row below is taken;
    * if the last row is already merged, the next row above is taken;
    * otherwise the neighbour with the higher `col_metric` is taken
      (the upper one on ties).

    Merging also stops once the merged paragraphs exceed
    ``tokenizer.model_max_length`` tokens (the paragraph that crossed the
    limit stays merged) or when no row is left to merge.

    Args:
        df: frame with 'bullets', 'para', 'best_match', 'para_num_tokens',
            'bullets_num_tokens', 'compression_ratio' and `col_metric`
            columns. Modified in place.
        col_metric: name of the similarity column used to pick a neighbour.
        verbose: print a trace of every merge.

    Returns:
        The same `df`, with ``best_match`` also True on the merged rows.
    """
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        if verbose : print(bul)
        df_bul = df[df['bullets'] == bul]

        # get book and chapter where this bullet is
        book = df_bul.index.get_level_values(0)[0]
        cpt = df_bul.index.get_level_values(1)[0]

        # positional 0..n-1 index, so that neighbours are idx-1 / idx+1
        df_bul = df_bul.reset_index()
        last_idx = len(df_bul) - 1
        if verbose : print(df_bul[df_bul['best_match']].para.tolist())
        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        merged_para_idx = [best_match_idx]

        bul_num_tok = df_bul.loc[best_match_idx].bullets_num_tokens
        comp_ratio = df_bul.loc[best_match_idx].compression_ratio
        if verbose:
            print('Book %s, Chapter %s'%(book, cpt))
            print('Paragraphs in this chapter:', len(df_bul))
            print('Location of best_bul index:', best_match_idx)
            print('Compression ratio before merging: %.2f %%'%comp_ratio)
            print()
        while comp_ratio > config.MAX_RATIO:
            first_merged = min(merged_para_idx)
            last_merged = max(merged_para_idx)
            at_top = first_merged == 0
            at_bottom = last_merged == last_idx

            if at_top and at_bottom:
                # Every row is merged already. Stop explicitly instead of
                # appending a non-existent row and relying on the resulting
                # NaN ratio to end the loop.
                if verbose : print('no paragraph left to merge')
                break

            if at_top:
                merge_down = True
            elif at_bottom:
                merge_down = False
            else:
                if verbose : print('based on metric %s '%col_metric, end='')
                # take the neighbour with the higher score (up on ties)
                merge_down = df_bul.loc[first_merged-1, col_metric] <\
                    df_bul.loc[last_merged+1, col_metric]

            if merge_down:
                if verbose : print('merge down')
                new_para_idx = last_merged+1
            else:
                if verbose : print('merge up')
                new_para_idx = first_merged-1
            df_bul.loc[new_para_idx, 'best_match'] = True
            merged_para_idx.append(new_para_idx)

            merged_para_len = np.sum(df_bul[df_bul['best_match']].para_num_tokens.tolist())
            comp_ratio = bul_num_tok / merged_para_len
            if merged_para_len > tokenizer.model_max_length:
                break
            if verbose:
                print(df_bul[df_bul['best_match']].para.tolist())
                print('Compression ratio: %.2f %%'%comp_ratio)
                print()

        # report the merged rows back onto the full frame
        merged_paras = df_bul.loc[merged_para_idx, 'para'].tolist()
        df.loc[(df['bullets'] == bul) &
            df['para'].isin(merged_paras), 'best_match'] = True
        if verbose : print()

    return df

In [13]:
def print_stats(df, num_para_tot=18822):
    """Print how many paragraphs and tokens are retained as best matches.

    A paragraph counts as kept when at least one of its rows has
    ``best_match`` set; its token count is taken from its first row.

    Args:
        df: frame with 'para', 'best_match' and 'para_num_tokens' columns.
        num_para_tot: denominator of the paragraph percentage. Defaults to
            18822, the value previously hard-coded here (presumably the
            paragraph count of the full dataset - TODO confirm).
    """
    num_para_kept = df.groupby('para')['best_match'].any().sum()
    print('%d out of %d paragraphs are considered using this method.'%(num_para_kept, num_para_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_para_kept/num_para_tot))

    print()
    # one row per distinct paragraph: kept flag and token count
    df_count_tokens = df.groupby('para', sort=False).agg({
        'best_match': 'any',
        'para_num_tokens': 'first'})
    num_tok_kept = df_count_tokens[df_count_tokens['best_match']].para_num_tokens.sum()
    num_tok_tot = df_count_tokens.para_num_tokens.sum()

    print('%d out of %d tokens are considered using this method.'%(num_tok_kept, num_tok_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_tok_kept/num_tok_tot))

def print_stats_after_merge(df):
    """Report merged texts that are still too short or too long.

    Prints the percentage of rows whose compression ratio is above
    ``config.MAX_RATIO``, then the number of rows (and the rows themselves)
    whose token count exceeds ``tokenizer.model_max_length``.
    """
    num_too_short = len(df[df['compression_ratio'] > config.MAX_RATIO])
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(num_too_short/len(df)*100))

    print()
    too_long = df[df['para_num_tokens'] > tokenizer.model_max_length]
    print('Paragraphs which are too long to fit into the model: %d paragraphs.'%\
          len(too_long))
    print(too_long)

### **Word2Vec Book Level**

#### Create Word Vectors

In [14]:
# Output directory of the Word2Vec variant of the dataset
op = OUTPUT_PATH + 'w2v/'
# exist_ok=True replaces the racy "check, then create" pair
os.makedirs(op, exist_ok=True)

In [15]:
# Work on a copy so the Word2Vec columns do not end up in the shared `df`
df_w2v = df.copy()

In [16]:
# One row per book: `para_proc` gathers the token lists of all its paragraphs.
# NOTE(review): `bullets_proc` keeps only the value found on the book's first
# row (list(bp)[0]); if bullets differ between chapters, the bullets of later
# chapters never enter the corpus - confirm this is intended.
df_w2v_book = df_w2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [17]:
# Training corpus of each book: paragraph token lists followed by the bullet
# token lists (element-wise list concatenation).
df_w2v_book['corpus'] = df_w2v_book.para_proc + df_w2v_book.bullets_proc

In [18]:
# Train one Word2Vec model per book on that book's corpus.
# NOTE(review): a fixed seed may not be enough for bit-identical vectors when
# gensim trains with several worker threads - confirm if that matters.
df_w2v_book['w2v'] = df_w2v_book.corpus.progress_map(lambda c:\
    gensim.models.Word2Vec(
        c,
        #size=128,
        #window=3,
        min_count=1,
        sg=1, # 1 for skip-gram; otherwise CBOW.
        seed = config.SEED))

100%|██████████| 53/53 [00:35<00:00,  1.49it/s]


In [19]:
def assign_word_vectors(r, col):
    """Look up the book-level word vector of every token in ``r[col]``.

    Args:
        r: row whose index label (``r.name``) starts with the book id; the
            vectors come from that book's model in `df_w2v_book`.
        col: name of the column holding the token list.

    Returns:
        list with one vector per token known to the book's model, in token
        order; unknown tokens are skipped.
    """
    book = r.name[0]
    wv = df_w2v_book.loc[book, 'w2v'].wv
    wv_list = []
    for x in r[col]:
        try:
            v = wv[x]
        except KeyError:
            # Token missing from the vocabulary: skip it. Only KeyError is
            # caught, so genuine failures are no longer silently swallowed.
            continue
        wv_list.append(v)
    return wv_list

# List of word vectors for the tokens of each paragraph
df_w2v['para_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'para_proc'), axis=1)

# taking the average of the w2v vector of each paragraph
df_w2v.para_wv = df_w2v.para_wv.progress_map(lambda p_wv: np.mean(p_wv, axis=0))

100%|██████████| 18773/18773 [00:01<00:00, 13738.85it/s]
100%|██████████| 18773/18773 [00:00<00:00, 37845.82it/s]


In [20]:
# Sanity check: show the paragraphs left without an averaged vector, then
# drop every row that has a missing value.
print(df_w2v[df_w2v.para_wv.isna()])
df_w2v = df_w2v.dropna()

Empty DataFrame
Columns: [para, bullets, para_proc, bullets_proc, para_wv]
Index: []


#### Explode, preprocess, w2v bullets

In [21]:
# One row per (paragraph, bullet) pair
df_w2v = df_w2v.explode('bullets')

# `bullets` now holds a single bullet per row: rebuild its token list
# (tokenize, then remove stop words) accordingly.
df_w2v['bullets_proc'] = df_w2v.bullets.progress_map(para2words)
df_w2v.bullets_proc = df_w2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:06<00:00, 16395.64it/s]
100%|██████████| 114277/114277 [00:04<00:00, 24420.58it/s]


In [22]:
# List of word vectors for the tokens of each bullet
df_w2v['bullets_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'bullets_proc'), axis=1)

# taking the average of the w2v vector of each bullet
df_w2v.bullets_wv = df_w2v.bullets_wv.progress_map(lambda b_wv: np.mean(b_wv, axis=0))

100%|██████████| 114277/114277 [00:05<00:00, 20827.49it/s]
100%|██████████| 114277/114277 [00:02<00:00, 50382.84it/s]


In [23]:
# Length of each paragraph and bullet in tokens of the model tokenizer
df_w2v['para_num_tokens'] = df_w2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_w2v['bullets_num_tokens'] = df_w2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

# Compression ratio = bullet length / paragraph length
df_w2v['compression_ratio'] = df_w2v.bullets_num_tokens / df_w2v.para_num_tokens

100%|██████████| 114277/114277 [00:47<00:00, 2385.32it/s]
100%|██████████| 114277/114277 [00:37<00:00, 3042.28it/s]


#### Calculate cosine similarity for each bullet-paragraph pair

In [24]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as their dot product divided by the product of their norms.
    """
    norm_product = np.linalg.norm(a)*np.linalg.norm(b)
    return np.dot(a, b) / norm_product

In [25]:
# Cosine similarity between the averaged paragraph and bullet vectors of each
# row. The two values are read by label: integer keys on a label-indexed row
# rely on a positional fallback that pandas has deprecated.
df_w2v['cosine_sim'] = df_w2v[['para_wv', 'bullets_wv']].progress_apply(lambda row:\
    cosine_sim(row['para_wv'], row['bullets_wv']), axis=1)

100%|██████████| 114277/114277 [00:03<00:00, 37387.53it/s]


#### Find Best Match and Expand

In [26]:
# find best match bullet-para for each bullet
# (fills the boolean `best_match` column, scoring by cosine similarity)
df_w2v = assign_best_metric_para(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1784.11it/s]


Percentage of paragraphs which are too short to be summarized: 55.44 %


In [27]:
# Extend each best match with neighbouring paragraphs while its compression
# ratio is above config.MAX_RATIO
df_w2v = expand_up_down(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [01:36<00:00, 26.58it/s]


In [28]:
# Share of paragraphs and tokens retained by the Word2Vec matching
print_stats(df_w2v)

4031 out of 18822 paragraphs are considered using this method. Thus, 21.42 %

425779 out of 1493612 tokens are considered using this method. Thus, 28.51 %


In [29]:
# Merge, per (book, chapter, bullet), all selected paragraphs into one text
df_w2v_merge = df_w2v[df_w2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    # 'sum' rather than the builtin: passing builtins to agg is deprecated
    'para_num_tokens': 'sum',
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_w2v_merge = df_w2v_merge.rename(columns={'para': 'text'})

# Compression ratio of the merged text
df_w2v_merge['compression_ratio'] = df_w2v_merge.bullets_num_tokens / df_w2v_merge.para_num_tokens

In [30]:
# Merged texts that are still too short, or longer than the tokenizer limit
print_stats_after_merge(df_w2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.31 %

Paragraphs which are too long to fit into the model: 24 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Anaplastic oligodendroglioma is a chemosensiti...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541062 ch_10    Nine biological therapies are now licensed for...   
9781908541086 ch_5     Clinicians and patients are best served by con...   
              ch_11    Eating disorders not otherwise specified is th...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Re

In [31]:
# Token-count distribution of the merged texts exceeding the tokenizer limit
df_w2v_merge[df_w2v_merge['para_num_tokens'] > tokenizer.model_max_length].para_num_tokens.describe()

count     24.000000
mean     559.916667
std       44.426702
min      514.000000
25%      524.500000
50%      548.000000
75%      583.000000
max      684.000000
Name: para_num_tokens, dtype: float64

#### Save dataset

In [32]:
# Persist the merged bullet / text pairs (one row per bullet)
df_w2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [33]:
# Regroup by (book, chapter), collecting bullets and texts into lists, so a
# chapter is never divided between train, validation and test.
df_w2v_merge = df_w2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [34]:
# Shuffle the chapters reproducibly, then cut where the running bullet count
# passes 80% and 90% of the total. The +1 keeps the chapter that crosses a
# threshold in the earlier split.
df_w2v_merge = df_w2v_merge.sample(frac=1, random_state=config.SEED)
df_w2v_merge['num_bulls'] = df_w2v_merge.bullets.map(len).cumsum()
tot_bulls = df_w2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [35]:
# Back to one row per bullet inside each split
train, val, test =\
    df_w2v_merge.iloc[:split1].explode('bullets'),\
    df_w2v_merge.iloc[split1:split2].explode('bullets'),\
    df_w2v_merge.iloc[split2:].explode('bullets')

# explode('bullets') leaves the `text` lists whole, so overwrite them with the
# exploded texts. NOTE(review): this relies on the `bullets` and `text` lists
# of a chapter having the same length and order, so that both explodes give
# identical indexes - confirm.
train['text'] = df_w2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_w2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_w2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [36]:
# Write every split as three line-aligned files:
#   <split>.source - merged text, <split>.target - bullet,
#   <split>.index  - (book, chapter) index of the pair
# UTF-8 is set explicitly so the files do not depend on the platform default.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    with open(op+split_name+'.source', 'w', encoding='utf-8') as f_source,\
        open(op+split_name+'.target', 'w', encoding='utf-8') as f_target,\
        open(op+split_name+'.index', 'w', encoding='utf-8') as f_index:
        for idx, row in split_df[['text', 'bullets']].iterrows():
            f_index.write(str(idx) + '\n')
            f_source.write(row.text + '\n')
            f_target.write(row.bullets + '\n')

### **Doc2Vec Book Level**

#### Create Doc Vectors

In [37]:
# Output directory of the Doc2Vec variant of the dataset
op = OUTPUT_PATH + 'd2v/'
# exist_ok=True replaces the racy "check, then create" pair
os.makedirs(op, exist_ok=True)

In [38]:
# Work on a copy so the Doc2Vec columns do not end up in the shared `df`
df_d2v = df.copy()

In [39]:
# One row per book: `para_proc` gathers the token lists of all its paragraphs.
# NOTE(review): `bullets_proc` keeps only the value found on the book's first
# row (list(bp)[0]); if bullets differ between chapters, the bullets of later
# chapters never enter the corpus - confirm this is intended.
df_d2v_book = df_d2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [40]:
# Corpus of each book: paragraph token lists followed by bullet token lists
df_d2v_book['corpus'] = df_d2v_book.para_proc + df_d2v_book.bullets_proc
# Doc2Vec input: every document is tagged with its position in the corpus
df_d2v_book['tagged_corpus'] = df_d2v_book.corpus.map(lambda c:
    [gensim.models.doc2vec.TaggedDocument(para, [i]) for i, para in enumerate(c)])

In [41]:
# Train one Doc2Vec model per book on that book's tagged corpus.
# NOTE(review): a fixed seed may not be enough for bit-identical vectors when
# gensim trains with several worker threads - confirm if that matters.
df_d2v_book['d2v'] = df_d2v_book.tagged_corpus.progress_map(lambda tc:\
    gensim.models.Doc2Vec(
        tc,
        dm=1, # 1 for PV-DM; otherwise PV-DBOW
        #vector_size=128,
        #window=3,
        #epochs=5,
        min_count=1,
        seed = config.SEED))

100%|██████████| 53/53 [00:40<00:00,  1.30it/s]


#### Explode and preprocess bullets

In [42]:
# One row per (paragraph, bullet) pair
df_d2v = df_d2v.explode('bullets')

# `bullets` now holds a single bullet per row: rebuild its token list
# (tokenize, then remove stop words) accordingly.
df_d2v['bullets_proc'] = df_d2v.bullets.progress_map(para2words)
df_d2v.bullets_proc = df_d2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:08<00:00, 14204.95it/s]
100%|██████████| 114277/114277 [00:05<00:00, 19101.35it/s]


In [43]:
# Length of each paragraph and bullet in tokens of the model tokenizer
df_d2v['para_num_tokens'] = df_d2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_d2v['bullets_num_tokens'] = df_d2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

# Compression ratio = bullet length / paragraph length
df_d2v['compression_ratio'] = df_d2v.bullets_num_tokens / df_d2v.para_num_tokens

100%|██████████| 114277/114277 [00:48<00:00, 2342.25it/s]
100%|██████████| 114277/114277 [00:46<00:00, 2439.71it/s]


#### Calculate similarity for each bullet-paragraph pair

In [44]:
def cosine_sim(a, b):
    """Cosine similarity of two vectors (same definition as in the Word2Vec section)."""
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def d2v_similarity(r):
    """Cosine similarity between the inferred vectors of a row's paragraph and bullet.

    Both vectors are inferred with the Doc2Vec model trained on the row's
    book (first level of the row's index label).
    NOTE(review): infer_vector is presumably stochastic, so scores may vary
    slightly between runs - confirm if reproducibility matters.
    """
    book = r.name[0]
    d2v = df_d2v_book.loc[book, 'd2v']
    dv_para = d2v.infer_vector(r.para_proc)
    dv_bullets = d2v.infer_vector(r.bullets_proc)
    
    return cosine_sim(dv_para, dv_bullets)
    
# Similarity score of every (paragraph, bullet) row
df_d2v['d2v_sim'] = df_d2v.progress_apply(lambda row: d2v_similarity(row), axis=1)

100%|██████████| 114277/114277 [03:21<00:00, 566.75it/s]


#### Find Best Match and Expand

In [45]:
# find best match bullet-para for each bullet
# (fills the boolean `best_match` column, scoring by Doc2Vec similarity)
df_d2v = assign_best_metric_para(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1306.81it/s]


Percentage of paragraphs which are too short to be summarized: 54.85 %


In [46]:
# expand_up_down flags the extra paragraphs on `df_d2v` itself and returns
# that same frame, so `df_d2v_expand` and `df_d2v` are one object; the cells
# below keep using `df_d2v`.
df_d2v_expand = expand_up_down(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [01:42<00:00, 25.03it/s]


In [47]:
# Share of paragraphs and tokens retained by the Doc2Vec matching
print_stats(df_d2v)

4013 out of 18822 paragraphs are considered using this method. Thus, 21.32 %

428408 out of 1493612 tokens are considered using this method. Thus, 28.68 %


In [48]:
# Merge, per (book, chapter, bullet), all selected paragraphs into one text
df_d2v_merge = df_d2v[df_d2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    # 'sum' rather than the builtin: passing builtins to agg is deprecated
    'para_num_tokens': 'sum',
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_d2v_merge = df_d2v_merge.rename(columns={'para': 'text'})

# Compression ratio of the merged text
df_d2v_merge['compression_ratio'] = df_d2v_merge.bullets_num_tokens / df_d2v_merge.para_num_tokens

In [49]:
# Merged texts that are still too short, or longer than the tokenizer limit
print_stats_after_merge(df_d2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.35 %

Paragraphs which are too long to fit into the model: 26 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541406 ch_4     Specific inquiry should be made about oral ant...   
              ch_15    The direct factor Xa and thrombin inhibitors c...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch11     Refer urgently, to be seen by an ophthalmologi...   
9781910797006 ch01     Ce

#### Save dataset

In [50]:
# Persist the merged bullet / text pairs (one row per bullet)
df_d2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [51]:
# Regroup by (book, chapter), collecting bullets and texts into lists, so a
# chapter is never divided between train, validation and test.
df_d2v_merge = df_d2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [52]:
# Shuffle the chapters reproducibly, then cut where the running bullet count
# passes 80% and 90% of the total. The +1 keeps the chapter that crosses a
# threshold in the earlier split.
df_d2v_merge = df_d2v_merge.sample(frac=1, random_state=config.SEED)
df_d2v_merge['num_bulls'] = df_d2v_merge.bullets.map(len).cumsum()
tot_bulls = df_d2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [53]:
# Back to one row per bullet inside each split
train, val, test =\
    df_d2v_merge.iloc[:split1].explode('bullets'),\
    df_d2v_merge.iloc[split1:split2].explode('bullets'),\
    df_d2v_merge.iloc[split2:].explode('bullets')

# explode('bullets') leaves the `text` lists whole, so overwrite them with the
# exploded texts. NOTE(review): this relies on the `bullets` and `text` lists
# of a chapter having the same length and order, so that both explodes give
# identical indexes - confirm.
train['text'] = df_d2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_d2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_d2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [54]:
# Write every split as three line-aligned files:
#   <split>.source - merged text, <split>.target - bullet,
#   <split>.index  - (book, chapter) index of the pair
# UTF-8 is set explicitly so the files do not depend on the platform default.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    with open(op+split_name+'.source', 'w', encoding='utf-8') as f_source,\
        open(op+split_name+'.target', 'w', encoding='utf-8') as f_target,\
        open(op+split_name+'.index', 'w', encoding='utf-8') as f_index:
        for idx, row in split_df[['text', 'bullets']].iterrows():
            f_index.write(str(idx) + '\n')
            f_source.write(row.text + '\n')
            f_target.write(row.bullets + '\n')

### **Sentence-Transformers Book Level**

In [55]:
# Output directory of the Sentence-Transformers variant of the dataset
op = OUTPUT_PATH + 'st/'
# exist_ok=True replaces the racy "check, then create" pair
os.makedirs(op, exist_ok=True)

In [56]:
# Work on a copy so the embedding columns do not end up in the shared `df`
df_st = df.copy()

#### Create embedding vectors for para

In [57]:
from sentence_transformers import SentenceTransformer

# might want to try 'msmarco-distilbert-base-v2' too
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Embed every paragraph once, before the frame is exploded per bullet
df_st['para_enc'] = df_st.para.progress_map(model.encode)

100%|██████████| 18773/18773 [23:30<00:00, 13.31it/s]


#### Explode bullets

In [58]:
# One row per (paragraph, bullet) pair
df_st = df_st.explode('bullets')

In [59]:
# Length of each paragraph and bullet in tokens of the model tokenizer
df_st['para_num_tokens'] = df_st.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_st['bullets_num_tokens'] = df_st.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

# Compression ratio = bullet length / paragraph length
df_st['compression_ratio'] = df_st.bullets_num_tokens / df_st.para_num_tokens

100%|██████████| 114277/114277 [00:51<00:00, 2225.06it/s]
100%|██████████| 114277/114277 [00:42<00:00, 2699.42it/s]


#### Create embedding vectors for bullets

In [60]:
# Distinct bullets of every (book, chapter), one per row, so that each bullet
# is embedded once rather than once per (paragraph, bullet) row.
bull_to_embed = df_st.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(set(b))
}).explode('bullets')

bull_to_embed['bullets_enc'] = bull_to_embed.bullets.progress_map(model.encode)

100%|██████████| 2556/2556 [01:53<00:00, 22.60it/s]


#### Calculate similarity for each bullet-paragraph pair

In [61]:
def cosine_sim(a, b):
    """Cosine similarity of two vectors (same definition as in the Word2Vec section)."""
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def sentence_transformers_sim(r):
    """Cosine similarity between a row's paragraph and bullet embeddings.

    The paragraph embedding is read from the row; the bullet embedding is
    looked up in `bull_to_embed` among the bullets of the row's book (first
    level of the row's index label), where it must appear exactly once.
    """
    book = r.name[0]
    b2e = bull_to_embed.loc[book]
    para_enc = r.para_enc
    bullets_enc = b2e.loc[(b2e.bullets == r.bullets), 'bullets_enc']
    assert len(bullets_enc) == 1
    # Explicit positional access: this Series is labelled by chapter, so
    # `[0]` would rely on pandas' deprecated integer-key fallback.
    bullets_enc = bullets_enc.iloc[0]

    return cosine_sim(para_enc, bullets_enc)

# Similarity score of every (paragraph, bullet) row
df_st['st_sim'] = df_st.progress_apply(sentence_transformers_sim, axis=1)

100%|██████████| 114277/114277 [01:29<00:00, 1274.74it/s]


#### Find Best Match and Expand

In [62]:
# find best match bullet-para for each bullet
# (fills the boolean `best_match` column, scoring by embedding similarity)
df_st = assign_best_metric_para(df_st, 'st_sim')

100%|██████████| 2556/2556 [00:01<00:00, 2179.31it/s]


Percentage of paragraphs which are too short to be summarized: 64.01 %


In [63]:
# Extend each best match with neighbouring paragraphs while its compression
# ratio is above config.MAX_RATIO
df_st = expand_up_down(df_st, 'st_sim')

100%|██████████| 2556/2556 [01:52<00:00, 22.69it/s]


In [64]:
# Share of paragraphs and tokens retained by the Sentence-Transformers matching
print_stats(df_st)

4560 out of 18822 paragraphs are considered using this method. Thus, 24.23 %

430587 out of 1493612 tokens are considered using this method. Thus, 28.83 %


In [65]:
# Merge, per (book, chapter, bullet), all selected paragraphs into one text
df_st_merge = df_st[df_st['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    # 'sum' rather than the builtin: passing builtins to agg is deprecated
    'para_num_tokens': 'sum',
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_st_merge = df_st_merge.rename(columns={'para': 'text'})

# Compression ratio of the merged text
df_st_merge['compression_ratio'] = df_st_merge.bullets_num_tokens / df_st_merge.para_num_tokens

In [66]:
# Merged texts that are still too short, or longer than the tokenizer limit
print_stats_after_merge(df_st_merge)

Percentage of paragraphs which are too short to be summarized: 0.27 %

Paragraphs which are too long to fit into the model: 26 paragraphs.
                                                                 bullets  \
book          chapter                                                      
9781908541024 ch_6     Concurrent and adjuvant temozolomide chemother...   
              ch_6     Observation may be an appropriate initial stra...   
9781908541086 ch_11    Eating disorders not otherwise specified is th...   
9781908541277 ch_11    Oral contraceptives containing at least 50 µg ...   
9781908541420 ch_6     Drugs used in the management of asthma can be ...   
9781908541727 ch07     Urgently refer patients who present with any o...   
              ch09     Refer urgently (the same day), patients with o...   
              ch10     Refer urgently, to be seen the same day, if yo...   
              ch10     Refer within 1 week, any child with:- squint (...   
              ch11     Re

#### Save dataset

In [67]:
# Persist the merged bullet / text pairs (one row per bullet)
df_st_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [68]:
# Regroup by (book, chapter), collecting bullets and texts into lists, so a
# chapter is never divided between train, validation and test.
df_st_merge = df_st_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [69]:
# Shuffle the chapters reproducibly, then cut where the running bullet count
# passes 80% and 90% of the total. The +1 keeps the chapter that crosses a
# threshold in the earlier split.
df_st_merge = df_st_merge.sample(frac=1, random_state=config.SEED)
df_st_merge['num_bulls'] = df_st_merge.bullets.map(len).cumsum()
tot_bulls = df_st_merge.num_bulls.iloc[-1]
split1 = np.where(df_st_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_st_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [70]:
# Back to one row per bullet inside each split
train, val, test =\
    df_st_merge.iloc[:split1].explode('bullets'),\
    df_st_merge.iloc[split1:split2].explode('bullets'),\
    df_st_merge.iloc[split2:].explode('bullets')

# explode('bullets') leaves the `text` lists whole, so overwrite them with the
# exploded texts. NOTE(review): this relies on the `bullets` and `text` lists
# of a chapter having the same length and order, so that both explodes give
# identical indexes - confirm.
train['text'] = df_st_merge.iloc[:split1].explode('text')['text']
val['text'] = df_st_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_st_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [71]:
# Write every split as three line-aligned files:
#   <split>.source - merged text, <split>.target - bullet,
#   <split>.index  - (book, chapter) index of the pair
# UTF-8 is set explicitly so the files do not depend on the platform default.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    with open(op+split_name+'.source', 'w', encoding='utf-8') as f_source,\
        open(op+split_name+'.target', 'w', encoding='utf-8') as f_target,\
        open(op+split_name+'.index', 'w', encoding='utf-8') as f_index:
        for idx, row in split_df[['text', 'bullets']].iterrows():
            f_index.write(str(idx) + '\n')
            f_source.write(row.text + '\n')
            f_target.write(row.bullets + '\n')

### **Print Some Examples**

In [70]:
def nice_print(idx, bull, list_text, list_text_num_tok, list_method):
    """Pretty-print a bullet next to the text each method selected for it.

    Prints `idx`, the bullet wrapped at 100 columns, then for every method
    its name, the token count in parentheses and the wrapped text, and
    finally a separator line of 100 '#' characters.
    """
    print(idx, '', 'Bullet:', fill(bull, 100), '', sep='\n')

    for text, num_tok, method in zip(list_text, list_text_num_tok, list_method):
        header = method+' (' +str(num_tok)+'):'
        print(header, fill(text, 100), '', sep='\n')

    print('#' * 100, end='\n\n')

#### W2V vs D2V vs Sentence-Transformers vs Rouge

In [72]:
import random

# Reload the merged datasets saved by each method, plus the ROUGE-based one
# (NOTE(review): that file is not written by the cells shown here - it is
# assumed to exist already).
df_w2v = pd.read_csv(OUTPUT_PATH+'w2v/df.csv').set_index(['book', 'chapter'])
df_d2v = pd.read_csv(OUTPUT_PATH+'d2v/df.csv').set_index(['book', 'chapter'])
df_st = pd.read_csv(OUTPUT_PATH+'st/df.csv').set_index(['book', 'chapter'])
df_rouge = pd.read_csv(magma_dir+'datasets/karger_books_para/'+MODEL+'/df.csv').set_index(['book', 'chapter'])

# Seeded, hence repeatable, draw of 10 example bullets
random.seed(config.SEED)

bullet_examples = random.sample(df_w2v.bullets.tolist(), 10)
print(bullet_examples)

['Pregnant women with significant renal impairment are likely to have hypertension, pre-eclampsia and premature labor.', 'Neuronal tumors are uncommon brain neoplasms typically diagnosed in children and young adults.', 'The prevention of nausea has been much less successful with currently approved agents.', 'Lung transplantation in patients with very advanced COPD improves health status and functional capacity, though it does not convey a survival benefit.', 'Acne scarring is a very common sequel to acne.', 'Decreased waist circumference in the absence of weight loss can keep a patient motivated.', 'Causes of acute asthma include viral respiratory infections, acute allergen exposure, food allergies and some medications such as acetylsalicylic acid (aspirin) and non-steroidal anti-inflammatory drugs.', 'Biosimilars are highly similar, but not identical, to their reference (originator) biologic.', 'FXI concentrates should be used with caution, because they predispose to arterial and veno

In [73]:
list_method = ['W2V', 'D2V', 'Sentence-Transformers', 'ROUGE']
# datasets in the same order as the method labels above
method_frames = [df_w2v, df_d2v, df_st, df_rouge]

for bull in bullet_examples:
    # rows of every method's dataset that belong to this bullet
    matches = [frame.loc[frame.bullets == bull] for frame in method_frames]

    # (book, chapter) of the bullet, taken from the W2V dataset
    idx = matches[0].index.tolist()[0]
    list_text = [match['text'].tolist()[0] for match in matches]
    list_text_num_tok = [match['para_num_tokens'].tolist()[0] for match in matches]

    nice_print(idx, bull, list_text, list_text_num_tok, list_method)

(9781908541468, 'ch_7')

Bullet:
Pregnant women with significant renal impairment are likely to have hypertension, pre-eclampsia and
premature labor.

W2V (229):
Renovascular disease is a remediable form of hypertension and should be excluded in high-risk
patients, such as elderly hypertensives with evidence of diffuse atherosclerosis, in refractory or
malignant hypertension, in those with 'flash' pulmonary edema and in individuals with an abdominal
bruit. In young women with hypertension of recent onset, fibromuscular renal artery disease should
be excluded. The preferred diagnostic tests include magnetic resonance angiography and ACE-inhibitor
renography. Duplex ultrasonography with Doppler flow measurements can be a useful screening test but
is rather operator dependent, and in many patients the renal arteries cannot be visualized. The
definitive diagnostic tests in almost all patients are still digital subtraction angiography or
arteriography (Figure 5.3), but both carry a risk of 