In [1]:
magma_dir = '/home/marco/epfl/magma/'

### **Config**

In [2]:
import os
import sys

sys.path.insert(0, magma_dir)
import config

In [3]:
MODEL = 'pegasus'

RE_SPLITTER = '\n'              # do we split sentences of paragraphs?
                                # use '\.(?!\d)|\n' or '\n', respectively

TOKEN_MAX_LEN = 99              # max length of a word
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

RECALL_THRESHOLD = 0.7

# Output path
OUTPUT_PATH = magma_dir+'datasets/karger_books_para_wordembed/'+MODEL+'/'
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

### **Init**

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import nltk
import gensim
from textwrap import fill
from tqdm import tqdm
tqdm.pandas()

if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer =\
        T5Tokenizer.from_pretrained('t5-large')

## **Karger Books Base Dataset**

In [5]:
base_dataset = magma_dir+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
df.bullets = df.bullets.map(eval, na_action='ignore')

## **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)

In [6]:
# Split in sentences / paragraphs based on RE_SPLITTER
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')
    
# explode to get one row for each paragraph /sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces
df.para = df.para.map(lambda p:
    re.sub('\s+', ' ', p).strip())

# Remove long words (> TOKEN_MAX_LEN chars)
def para2words(para):
    return gensim.utils.simple_preprocess(
        para, deacc=True, max_len=TOKEN_MAX_LEN)
df['para_proc'] = df.para.map(para2words)
df['bullets_proc'] = df.bullets.map(lambda bs: [para2words(b) for b in bs])

#### Further Preprocessing

* Remove stop words
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [7]:
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df.para_proc = df.para_proc.map(lambda p:
    [w for w in p if w not in stop_words])
df.bullets_proc = df.bullets_proc.map(lambda bs:
    [[w for w in b if w not in stop_words] for b in bs])

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)
df.loc[df.para_proc.map(len) <\
    PARA_MIN_LENGTH, 'para_proc'] = np.nan

df = df.dropna()

In [9]:
df.para = df.para.map(lambda p: p+'.')

## **Assign Bullets to Best Para and Expand Functions**

In [10]:
def assign_best_metric_para(df, col_metric):
    df['best_match'] = False

    for idx, para  in df.groupby('bullets').progress_apply(
        lambda g: g.iloc[g[col_metric].argmax()]).para.iteritems():
        
        df.loc[\
            (df['bullets'] == idx) &\
            (df['para'] == para), 'best_match'] = True
    
    para_too_short =\
        df[(df['compression_ratio'] >= config.MAX_RATIO) & df['best_match']]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df[df['best_match']])*100))
    
    return df

In [11]:
def expand_up_down(df, col_metric):
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        df_bul = df[df['bullets'] == bul]
        
        # get book and chapter where this bullet is
        book = df_bul.index.get_level_values(0)[0]
        cpt = df_bul.index.get_level_values(1)[0]

        df_bul = df_bul.reset_index()
        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        merged_para_idx = [best_match_idx]

        bul_num_tok = df_bul.loc[best_match_idx, 'bullets_num_tokens']
        merged_para_num_tok = df_bul.loc[best_match_idx, 'para_num_tokens']
        comp_ratio = df_bul.loc[best_match_idx, 'compression_ratio']
        num_bul_cpt = len(set(df.loc[book, cpt].bullets.tolist()))
        max_idx = len(df_bul)-1
        
        while comp_ratio > config.MAX_RATIO and\
            merged_para_num_tok < tokenizer.model_max_length:
            
            # if we already merged all possible paragraphs
            if (0 in merged_para_idx) and (max_idx in merged_para_idx):
                break
                
            # if we already merged the first paragraph
            elif 0 in merged_para_idx:
                new_para_idx = max(merged_para_idx)+1
                
            # if we already merged the last paragraph
            elif max_idx in merged_para_idx:
                new_para_idx = min(merged_para_idx)-1
                
            # otherwise check for best metric inclusion
            else:
                if df_bul.loc[min(merged_para_idx)-1, col_metric] <\
                    df_bul.loc[max(merged_para_idx)+1, col_metric]:
                    # merge down
                    new_para_idx = max(merged_para_idx)+1
                    
                else: # merge up
                    new_para_idx = min(merged_para_idx)-1       

            df_bul.loc[new_para_idx, 'best_match'] = True
            merged_para_idx.append(new_para_idx)
            
            merged_para_num_tok += df_bul.loc[new_para_idx, 'para_num_tokens']
            comp_ratio = bul_num_tok / merged_para_num_tok

        for p, b in zip(df_bul.loc[merged_para_idx]['para'].tolist(),
            df_bul.loc[merged_para_idx]['bullets'].tolist()):
            df.loc[(df['para'] == p) &
                (df['bullets'] == b), 'best_match'] = True
        
    return df

In [12]:
def expand_up_down_numtok_th(df, col_metric):
    # for each bullet
    for bul in tqdm(set(df.bullets.tolist())):
        df_bul = df[df['bullets'] == bul]
        
        # get book and chapter where this bullet is
        book = df_bul.index.get_level_values(0)[0]
        cpt = df_bul.index.get_level_values(1)[0]

        df_bul = df_bul.reset_index()
        # get best match index
        best_match_idx = np.where(df_bul['best_match'])[0][0]
        merged_para_idx = [best_match_idx]

        bul_num_tok = df_bul.loc[best_match_idx, 'bullets_num_tokens']
        merged_para_num_tok = df_bul.loc[best_match_idx, 'para_num_tokens']
        num_bul_cpt = len(set(df.loc[book, cpt].bullets.tolist()))
        num_tok_threshold = min(df_bul.para_num_tokens.sum() / num_bul_cpt,
                                0.9*tokenizer.model_max_length)
        max_idx = len(df_bul)-1
        
        while merged_para_num_tok < num_tok_threshold:
            
            # if we already merged all possible paragraphs
            if (0 in merged_para_idx) and (max_idx in merged_para_idx):
                break
                
            # if we already merged the first paragraph
            elif 0 in merged_para_idx:
                new_para_idx = max(merged_para_idx)+1
                
            # if we already merged the last paragraph
            elif max_idx in merged_para_idx:
                new_para_idx = min(merged_para_idx)-1
                
            # otherwise check for best metric inclusion
            else:
                if df_bul.loc[min(merged_para_idx)-1, col_metric] <\
                    df_bul.loc[max(merged_para_idx)+1, col_metric]:
                    # merge down
                    new_para_idx = max(merged_para_idx)+1
                else: # merge up
                    new_para_idx = min(merged_para_idx)-1   

            df_bul.loc[new_para_idx, 'best_match'] = True
            merged_para_idx.append(new_para_idx)
            merged_para_num_tok += df_bul.loc[new_para_idx, 'para_num_tokens']

        for p, b in zip(df_bul.loc[merged_para_idx]['para'].tolist(),
            df_bul.loc[merged_para_idx]['bullets'].tolist()):
            df.loc[(df['para'] == p) &
                (df['bullets'] == b), 'best_match'] = True
        
    return df

In [13]:
def print_stats(df):
    num_para_tot = 18822
    num_para_kept = np.sum(df.groupby('para')['best_match'].apply(np.any).tolist())
    print('%d out of %d paragraphs are considered using this method.'%(num_para_kept, num_para_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_para_kept/num_para_tot))
    
    print()
    df_count_tokens = df.groupby('para', sort=False).agg({
        'best_match': lambda bm: np.any(list(bm)),
        'para_num_tokens': lambda pnt: list(pnt)[0]})
    num_tok_kept = df_count_tokens[df_count_tokens['best_match']].para_num_tokens.sum()
    num_tok_tot = df_count_tokens.para_num_tokens.sum()

    print('%d out of %d tokens are considered using this method.'%(num_tok_kept, num_tok_tot), end=' ')
    print('Thus, %.2f %%'%(100*num_tok_kept/num_tok_tot))

def print_stats_after_merge(df):
    para_too_short = df[df['compression_ratio'] > config.MAX_RATIO]
    print('Percentage of paragraphs which are too short to be summarized: %.2f %%'\
        %(len(para_too_short)/len(df)*100))
    
    print()
    print('Paragraphs which are too long to fit into the model: %d paragraphs.'%\
          len(df[df['para_num_tokens'] > tokenizer.model_max_length]))
    print(df[df['para_num_tokens'] > tokenizer.model_max_length])

## **Word2Vec Book Level**

#### Create Word Vectors

In [14]:
op = OUTPUT_PATH + 'w2v/'
if not os.path.exists(op):
    os.makedirs(op)

In [15]:
df_w2v = df.copy()

In [16]:
df_w2v_book = df_w2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [17]:
df_w2v_book['corpus'] = df_w2v_book.para_proc + df_w2v_book.bullets_proc

In [18]:
df_w2v_book['w2v'] = df_w2v_book.corpus.progress_map(lambda c:\
    gensim.models.Word2Vec(
        c,
        #size=128,
        #window=3,
        min_count=1,
        sg=1, # 1 for skip-gram; otherwise CBOW.
        seed = config.SEED))

100%|██████████| 53/53 [00:38<00:00,  1.38it/s]


In [19]:
def assign_word_vectors(r, col):
    book = r.name[0]
    wv = df_w2v_book.loc[book, 'w2v'].wv
    wv_list = []
    for x in r[col]:
        try:
            v = wv[x]
        except:
            continue
        wv_list.append(v)
    return wv_list

df_w2v['para_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'para_proc'), axis=1)

# taking the average of the w2v vector of each paragraph
df_w2v.para_wv = df_w2v.para_wv.progress_map(lambda p_wv: np.mean(p_wv, axis=0))

100%|██████████| 18773/18773 [00:01<00:00, 11339.62it/s]
100%|██████████| 18773/18773 [00:00<00:00, 31967.78it/s]


In [20]:
print(df_w2v[df_w2v.para_wv.isna()])
df_w2v = df_w2v.dropna()

Empty DataFrame
Columns: [para, bullets, para_proc, bullets_proc, para_wv]
Index: []


#### Explode, preprocess, w2v bullets

In [21]:
df_w2v = df_w2v.explode('bullets')

df_w2v['bullets_proc'] = df_w2v.bullets.progress_map(para2words)
df_w2v.bullets_proc = df_w2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:10<00:00, 10942.81it/s]
100%|██████████| 114277/114277 [00:06<00:00, 16737.64it/s]


In [22]:
df_w2v['bullets_wv'] = df_w2v.progress_apply(lambda row: assign_word_vectors(row, 'bullets_proc'), axis=1)

# taking the average of the w2v vector of each bullet
df_w2v.bullets_wv = df_w2v.bullets_wv.progress_map(lambda b_wv: np.mean(b_wv, axis=0))

100%|██████████| 114277/114277 [00:06<00:00, 18080.65it/s]
100%|██████████| 114277/114277 [00:02<00:00, 42885.92it/s]


In [23]:
df_w2v['para_num_tokens'] = df_w2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_w2v['bullets_num_tokens'] = df_w2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_w2v['compression_ratio'] = df_w2v.bullets_num_tokens / df_w2v.para_num_tokens

100%|██████████| 114277/114277 [00:55<00:00, 2062.96it/s]
100%|██████████| 114277/114277 [00:46<00:00, 2453.12it/s]


#### Calculate cosine similarity between each couple bullet-para

In [24]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

In [25]:
df_w2v['cosine_sim'] = df_w2v[['para_wv', 'bullets_wv']].progress_apply(lambda row:\
    cosine_sim(row[0], row[1]), axis=1)

100%|██████████| 114277/114277 [00:04<00:00, 25128.01it/s]


#### Find Best Match and Expand

In [26]:
# find best match bullet-para for each bullet
df_w2v = assign_best_metric_para(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1651.68it/s]


Percentage of paragraphs which are too short to be summarized: 53.79 %


In [27]:
df_w2v = expand_up_down(df_w2v, 'cosine_sim')

100%|██████████| 2556/2556 [01:40<00:00, 25.51it/s]


In [28]:
print_stats(df_w2v)

3961 out of 18822 paragraphs are considered using this method. Thus, 21.04 %

345838 out of 1229678 tokens are considered using this method. Thus, 28.12 %


In [29]:
df_w2v_merge = df_w2v[df_w2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    'para_num_tokens': sum,
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_w2v_merge = df_w2v_merge.rename(columns={'para': 'text'})

df_w2v_merge['compression_ratio'] = df_w2v_merge.bullets_num_tokens / df_w2v_merge.para_num_tokens

In [30]:
print_stats_after_merge(df_w2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.00 %

Paragraphs which are too long to fit into the model: 0 paragraphs.
Empty DataFrame
Columns: [bullets, text, para_num_tokens, bullets_num_tokens, compression_ratio]
Index: []


In [31]:
df_w2v_merge[df_w2v_merge['para_num_tokens'] > tokenizer.model_max_length].para_num_tokens.describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: para_num_tokens, dtype: float64

#### Save dataset

In [32]:
df_w2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [33]:
df_w2v_merge = df_w2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [34]:
df_w2v_merge = df_w2v_merge.sample(frac=1, random_state=config.SEED)
df_w2v_merge['num_bulls'] = df_w2v_merge.bullets.map(len).cumsum()
tot_bulls = df_w2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_w2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [35]:
train, val, test =\
    df_w2v_merge.iloc[:split1].explode('bullets'),\
    df_w2v_merge.iloc[split1:split2].explode('bullets'),\
    df_w2v_merge.iloc[split2:].explode('bullets')

train['text'] = df_w2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_w2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_w2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [36]:
with open(op+'train.source', 'w') as tr_s,\
    open(op+'train.target', 'w') as tr_t,\
    open(op+'train.index', 'w') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')
        
with open(op+'val.source', 'w') as va_s,\
    open(op+'val.target', 'w') as va_t,\
    open(op+'val.index', 'w') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')
        
with open(op+'test.source', 'w') as te_s,\
    open(op+'test.target', 'w') as te_t,\
    open(op+'test.index', 'w') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

## **Doc2Vec Book Level**

#### Create Doc Vectors

In [37]:
op = OUTPUT_PATH + 'd2v/'
if not os.path.exists(op):
    os.makedirs(op)

In [38]:
df_d2v = df.copy()

In [39]:
df_d2v_book = df_d2v.groupby('book', sort=False).agg({
    'para_proc': lambda pp: list(pp),
    'bullets_proc': lambda bp: list(bp)[0]
})

In [40]:
df_d2v_book['corpus'] = df_d2v_book.para_proc + df_d2v_book.bullets_proc
df_d2v_book['tagged_corpus'] = df_d2v_book.corpus.map(lambda c:
    [gensim.models.doc2vec.TaggedDocument(para, [i]) for i, para in enumerate(c)])

In [41]:
df_d2v_book['d2v'] = df_d2v_book.tagged_corpus.progress_map(lambda tc:\
    gensim.models.Doc2Vec(
        tc,
        dm=1, # 1 for PV-DM; otherwise PV-DBOW
        #vector_size=128,
        #window=3,
        #epochs=5,
        min_count=1,
        seed = config.SEED))

100%|██████████| 53/53 [00:36<00:00,  1.44it/s]


#### Explode and preprocess bullets

In [42]:
df_d2v = df_d2v.explode('bullets')

df_d2v['bullets_proc'] = df_d2v.bullets.progress_map(para2words)
df_d2v.bullets_proc = df_d2v.bullets_proc.progress_map(lambda b:
    [w for w in b if w not in stop_words])

100%|██████████| 114277/114277 [00:07<00:00, 14568.97it/s]
100%|██████████| 114277/114277 [00:04<00:00, 23445.69it/s]


In [43]:
df_d2v['para_num_tokens'] = df_d2v.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_d2v['bullets_num_tokens'] = df_d2v.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_d2v['compression_ratio'] = df_d2v.bullets_num_tokens / df_d2v.para_num_tokens

100%|██████████| 114277/114277 [00:52<00:00, 2184.74it/s]
100%|██████████| 114277/114277 [00:43<00:00, 2650.27it/s]


#### Calculate similarity between each couple bullet-para

In [44]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def d2v_similarity(r):
    book = r.name[0]
    d2v = df_d2v_book.loc[book, 'd2v']
    dv_para = d2v.infer_vector(r.para_proc)
    dv_bullets = d2v.infer_vector(r.bullets_proc)
    
    return cosine_sim(dv_para, dv_bullets)
    
df_d2v['d2v_sim'] = df_d2v.progress_apply(lambda row: d2v_similarity(row), axis=1)

100%|██████████| 114277/114277 [02:40<00:00, 711.70it/s]


#### Find Best Match and Expand

In [45]:
# find best match bullet-para for each bullet
df_d2v = assign_best_metric_para(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1365.76it/s]


Percentage of paragraphs which are too short to be summarized: 56.57 %


In [46]:
df_d2v_expand = expand_up_down(df_d2v, 'd2v_sim')

100%|██████████| 2556/2556 [01:40<00:00, 25.32it/s]


In [47]:
print_stats(df_d2v)

4105 out of 18822 paragraphs are considered using this method. Thus, 21.81 %

354731 out of 1229678 tokens are considered using this method. Thus, 28.85 %


In [48]:
df_d2v_merge = df_d2v[df_d2v['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
.agg({
    'para': lambda p: ' '.join(list(p)),
    'para_num_tokens': sum,
    'bullets_num_tokens': lambda bnt: list(bnt)[0]
}).reset_index(level='bullets')
df_d2v_merge = df_d2v_merge.rename(columns={'para': 'text'})

df_d2v_merge['compression_ratio'] = df_d2v_merge.bullets_num_tokens / df_d2v_merge.para_num_tokens

In [49]:
print_stats_after_merge(df_d2v_merge)

Percentage of paragraphs which are too short to be summarized: 0.00 %

Paragraphs which are too long to fit into the model: 0 paragraphs.
Empty DataFrame
Columns: [bullets, text, para_num_tokens, bullets_num_tokens, compression_ratio]
Index: []


#### Save dataset

In [50]:
df_d2v_merge.to_csv(op+'df.csv')

#### Create train, test, validation

In [51]:
df_d2v_merge = df_d2v_merge.groupby(level=[0, 1], sort=False).agg({
    'bullets': lambda b: list(b),
    'text': lambda t: list(t),
})

In [52]:
df_d2v_merge = df_d2v_merge.sample(frac=1, random_state=config.SEED)
df_d2v_merge['num_bulls'] = df_d2v_merge.bullets.map(len).cumsum()
tot_bulls = df_d2v_merge.num_bulls.iloc[-1]
split1 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
split2 = np.where(df_d2v_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
print(split1, split2)

361 408


In [53]:
train, val, test =\
    df_d2v_merge.iloc[:split1].explode('bullets'),\
    df_d2v_merge.iloc[split1:split2].explode('bullets'),\
    df_d2v_merge.iloc[split2:].explode('bullets')

train['text'] = df_d2v_merge.iloc[:split1].explode('text')['text']
val['text'] = df_d2v_merge.iloc[split1:split2].explode('text')['text']
test['text'] = df_d2v_merge.iloc[split2:].explode('text')['text']

train.to_csv(op+'train.csv')
val.to_csv(op+'val.csv')
test.to_csv(op+'test.csv')

In [54]:
with open(op+'train.source', 'w') as tr_s,\
    open(op+'train.target', 'w') as tr_t,\
    open(op+'train.index', 'w') as tr_i:
    for idx, row in train[['text', 'bullets']].iterrows():
        tr_i.write(str(idx) + '\n')
        tr_s.write(row.text + '\n')
        tr_t.write(row.bullets + '\n')
        
with open(op+'val.source', 'w') as va_s,\
    open(op+'val.target', 'w') as va_t,\
    open(op+'val.index', 'w') as va_i:
    for idx, row in val[['text', 'bullets']].iterrows():
        va_i.write(str(idx) + '\n')
        va_s.write(row.text + '\n')
        va_t.write(row.bullets + '\n')
        
with open(op+'test.source', 'w') as te_s,\
    open(op+'test.target', 'w') as te_t,\
    open(op+'test.index', 'w') as te_i:
    for idx, row in test[['text', 'bullets']].iterrows():
        te_i.write(str(idx) + '\n')
        te_s.write(row.text + '\n')
        te_t.write(row.bullets + '\n')

## **Sentence-Transformers Book Level**

In [55]:
op = OUTPUT_PATH + 'st/'
if not os.path.exists(op):
    os.makedirs(op)

In [56]:
df_st = df.copy()

#### Create embedding vectors for para

In [57]:
from sentence_transformers import SentenceTransformer

# might want to try 'msmarco-distilbert-base-v2' too
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

df_st['para_enc'] = df_st.para.progress_map(model.encode)

100%|██████████| 18773/18773 [20:25<00:00, 15.32it/s]


#### Explode bullets

In [58]:
df_st = df_st.explode('bullets')

In [59]:
df_st['para_num_tokens'] = df_st.para.progress_map(lambda p: len(tokenizer.tokenize(p)))
df_st['bullets_num_tokens'] = df_st.bullets.progress_map(lambda b: len(tokenizer.tokenize(b)))

df_st['compression_ratio'] = df_st.bullets_num_tokens / df_st.para_num_tokens

100%|██████████| 114277/114277 [00:48<00:00, 2332.50it/s]
100%|██████████| 114277/114277 [00:42<00:00, 2687.26it/s]


#### Create embedding vectors for bullets

In [60]:
bull_to_embed = df_st.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(set(b))
}).explode('bullets')

bull_to_embed['bullets_enc'] = bull_to_embed.bullets.progress_map(model.encode)

100%|██████████| 2556/2556 [01:50<00:00, 23.21it/s]


#### Calculate similarity between each couple bullet-para

In [61]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

def sentence_transformers_sim(r):
    book = r.name[0]
    b2e = bull_to_embed.loc[book]
    para_enc = r.para_enc
    bullets_enc = b2e.loc[(b2e.bullets == r.bullets), 'bullets_enc']
    assert len(bullets_enc) == 1
    bullets_enc = bullets_enc[0]
    
    return cosine_sim(para_enc, bullets_enc)
    
df_st['st_sim'] = df_st.progress_apply(sentence_transformers_sim, axis=1)

100%|██████████| 114277/114277 [01:36<00:00, 1180.34it/s]


### Find Best Match and Expand

In [62]:
# find best match bullet-para for each bullet
df_st = assign_best_metric_para(df_st, 'st_sim')

100%|██████████| 2556/2556 [00:01<00:00, 1609.93it/s]


Percentage of paragraphs which are too short to be summarized: 63.50 %


In [63]:
df_st_base = expand_up_down(df_st.copy(), 'st_sim')

100%|██████████| 2556/2556 [01:53<00:00, 22.59it/s]


In [64]:
df_st_th = expand_up_down_numtok_th(df_st.copy(), 'st_sim')

100%|██████████| 2556/2556 [04:58<00:00,  8.56it/s]


In [65]:
print_stats(df_st_base)
print()
print_stats(df_st_th)

4558 out of 18822 paragraphs are considered using this method. Thus, 24.22 %

353137 out of 1229678 tokens are considered using this method. Thus, 28.72 %

11315 out of 18822 paragraphs are considered using this method. Thus, 60.12 %

804751 out of 1229678 tokens are considered using this method. Thus, 65.44 %


### Study Overlaps

##### Functions

In [66]:
def create_overlap_matrix(r):
    num_bulls = len(r.selected_para)
    #assert num_bulls == len(r.bullets)
    overlap_matrix = np.zeros((num_bulls,num_bulls))
    
    def list_overlap(a, b):
        return list( set(a).intersection(set(b)) )
    
    for i in range(num_bulls):
        for j in range(num_bulls):
            if i == j : continue
            num_tok_i = np.sum(r.para_num_tokens[r.selected_para[i]])
            overlap = list_overlap(
                r.selected_para[i], r.selected_para[j])
            num_tok_overlap = np.sum(r.para_num_tokens[overlap])
            assert num_tok_overlap <= num_tok_i
            
            overlap_matrix[i, j] = round(num_tok_overlap/num_tok_i*100, 2)
    
    return overlap_matrix

def find_big_overlap(r, threshold):
    om = r.overlap_matrix
    big_overlap_idx = np.argwhere(om >= threshold)
    big_overlap_idx = set([frozenset(t) for t in big_overlap_idx])
    merged = set()
    to_be_merged = set()
    for idx in big_overlap_idx:
        idx = tuple(idx)
        i, j = idx[0], idx[1]
        if i not in merged and j not in merged:
            to_be_merged.add(idx)
            merged.add(i)
            merged.add(j)
    return to_be_merged

def merge_bullets(r):
    for i, j in r.to_be_merged:
        r.bullets[i] += (' '+r.bullets[j])
        r.bullets_num_tokens[i] += r.bullets_num_tokens[j]
        r.selected_para[i] = np.array(list(set(
            np.concatenate((r.selected_para[i], r.selected_para[j])))))
        
        r.bullets[j] = None
        r.bullets_num_tokens[j] = None
        r.selected_para[j] = None
    r.bullets = [b for b in r.bullets if b is not None]
    r.bullets_num_tokens = [bnt for bnt in r.bullets_num_tokens if bnt is not None]
    r.selected_para = [sp for sp in r.selected_para if sp is not None]

##### Base

In [67]:
df_st_base_overlap = df_st_base.groupby(['book', 'chapter', 'bullets'], sort=False).agg({
    'para': lambda p: list(p),
    'para_num_tokens': lambda pnt: list(pnt),
    'bullets_num_tokens': lambda bnt: list(bnt)[0],
    'best_match': lambda bm: list(bm)
}).reset_index('bullets')
df_st_base_overlap.best_match = df_st_base_overlap.best_match.map(lambda bm: np.where(bm)[0])
df_st_base_overlap = df_st_base_overlap.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(b),
    'para': lambda p: list(p)[0],
    'para_num_tokens': lambda pnt: list(pnt)[0],
    'bullets_num_tokens': lambda bnt: list(bnt),
    'best_match': lambda bm: list(bm)
}).rename(columns={'best_match': 'selected_para'})
df_st_base_overlap.para_num_tokens = df_st_base_overlap.para_num_tokens.map(np.array)

In [68]:
while True:
    print('\nTotal number of bullets: %d'%(df_st_base_overlap.bullets.map(len).sum()))
    df_st_base_overlap['overlap_matrix'] = df_st_base_overlap.apply(create_overlap_matrix, axis=1)

    df_st_base_overlap['to_be_merged'] = df_st_base_overlap.apply(lambda row: find_big_overlap(row, 90), axis=1)

    num_to_be_merged = df_st_base_overlap.to_be_merged.map(len).sum()
    print('Bullets to be merged: %d'%num_to_be_merged)
    if (num_to_be_merged <= 0) : break

    df_st_base_overlap.apply(merge_bullets, axis=1)


Total number of bullets: 2556
Bullets to be merged: 412

Total number of bullets: 2144
Bullets to be merged: 64

Total number of bullets: 2080
Bullets to be merged: 9

Total number of bullets: 2071
Bullets to be merged: 2

Total number of bullets: 2069
Bullets to be merged: 0


##### Num Tok Threshold

In [69]:
df_st_th_overlap = df_st_th.groupby(['book', 'chapter', 'bullets'], sort=False).agg({
    'para': lambda p: list(p),
    'para_num_tokens': lambda pnt: list(pnt),
    'bullets_num_tokens': lambda bnt: list(bnt)[0],
    'best_match': lambda bm: list(bm)
}).reset_index('bullets')
df_st_th_overlap.best_match = df_st_th_overlap.best_match.map(lambda bm: np.where(bm)[0])
df_st_th_overlap = df_st_th_overlap.groupby(['book', 'chapter'], sort=False).agg({
    'bullets': lambda b: list(b),
    'para': lambda p: list(p)[0],
    'para_num_tokens': lambda pnt: list(pnt)[0],
    'bullets_num_tokens': lambda bnt: list(bnt),
    'best_match': lambda bm: list(bm)
}).rename(columns={'best_match': 'selected_para'})
df_st_th_overlap.para_num_tokens = df_st_th_overlap.para_num_tokens.map(np.array)

In [70]:
while True:
    print('\nTotal number of bullets: %d'%(df_st_th_overlap.bullets.map(len).sum()))
    df_st_th_overlap['overlap_matrix'] = df_st_th_overlap.apply(lambda row: create_overlap_matrix(row), axis=1)
    df_st_th_overlap.overlap_matrix.map(lambda om: np.sum(om > 90)).describe()

    df_st_th_overlap['to_be_merged'] = df_st_th_overlap.apply(lambda row: find_big_overlap(row, 90), axis=1)

    num_to_be_merged = df_st_th_overlap.to_be_merged.map(len).sum()
    print('Bullets to be merged: %d'%num_to_be_merged)
    if (num_to_be_merged <= 0) : break

    df_st_th_overlap.apply(merge_bullets, axis=1)


Total number of bullets: 2556
Bullets to be merged: 532

Total number of bullets: 2024
Bullets to be merged: 118

Total number of bullets: 1906
Bullets to be merged: 3

Total number of bullets: 1903
Bullets to be merged: 0


### Putting Things Together

##### Base

In [71]:
df_st_base_merge =\
    df_st_base[df_st_base['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
    .agg({
        'para': lambda p: ' '.join(list(p)),
        'para_num_tokens': sum,
        'bullets_num_tokens': lambda bnt: list(bnt)[0]
    }).reset_index(level='bullets')
df_st_base_merge = df_st_base_merge.rename(columns={'para': 'text'})

df_st_base_merge['compression_ratio'] = df_st_base_merge.bullets_num_tokens / df_st_base_merge.para_num_tokens

In [72]:
df_st_base_merge.bullets_num_tokens.describe()

count    2556.000000
mean       28.862285
std        16.298578
min         2.000000
25%        18.000000
50%        25.000000
75%        36.000000
max       223.000000
Name: bullets_num_tokens, dtype: float64

In [73]:
df_st_base_merge.para_num_tokens.describe()

count    2556.000000
mean      170.105634
std        79.163032
min         8.000000
25%       115.000000
50%       156.000000
75%       212.000000
max       988.000000
Name: para_num_tokens, dtype: float64

In [74]:
print_stats_after_merge(df_st_base_merge)

Percentage of paragraphs which are too short to be summarized: 0.00 %

Paragraphs which are too long to fit into the model: 0 paragraphs.
Empty DataFrame
Columns: [bullets, text, para_num_tokens, bullets_num_tokens, compression_ratio]
Index: []


##### Num Tok Threshold

In [75]:
df_st_th_merge =\
    df_st_th[df_st_th['best_match']].reset_index().groupby(['book', 'chapter', 'bullets'], sort=False)\
    .agg({
        'para': lambda p: ' '.join(list(p)),
        'para_num_tokens': sum,
        'bullets_num_tokens': lambda bnt: list(bnt)[0]
    }).reset_index(level='bullets')
df_st_th_merge = df_st_th_merge.rename(columns={'para': 'text'})

df_st_th_merge['compression_ratio'] =\
    df_st_th_merge.bullets_num_tokens / df_st_th_merge.para_num_tokens

In [76]:
df_st_th_merge.bullets_num_tokens.describe()

count    2556.000000
mean       28.862285
std        16.298578
min         2.000000
25%        18.000000
50%        25.000000
75%        36.000000
max       223.000000
Name: bullets_num_tokens, dtype: float64

In [77]:
df_st_th_merge.para_num_tokens.describe()

count    2556.000000
mean      513.486698
std       223.103295
min        94.000000
25%       339.000000
50%       466.000000
75%       662.000000
max      1134.000000
Name: para_num_tokens, dtype: float64

In [78]:
print_stats_after_merge(df_st_th_merge)

Percentage of paragraphs which are too short to be summarized: 0.86 %

Paragraphs which are too long to fit into the model: 31 paragraphs.
                                                                  bullets  \
book          chapter                                                       
9781908541178 ch_6      Piriformis tenderness is an area of local tend...   
              ch_6      Facet arthritis and trochanteric bursal irrita...   
              ch_6      Radiographic spinal stenosis, facet arthrosis ...   
              ch_8      Surgery is usually effective for refractory sc...   
9781908541406 ch_6      Thrombocytopenia may occur with some inherited...   
              ch_6      In immune thrombocytopenia, bleeding is uncomm...   
              ch_6      Platelet transfusion has limited indications a...   
9781908541420 ch_6      Drugs used in the management of asthma can be ...   
9781908541703 ch_5      Triple assessment - clinical examination plus ...   
              

##### Base No Overlap

In [79]:
#df_st_merge_no_overlap = 
df_st_base_overlap['bp'] = df_st_base_overlap.apply(lambda r:\
    [(b, sp, bnt) for b, sp, bnt in zip(r.bullets, r.selected_para, r.bullets_num_tokens)], axis=1)
df_st_base_over_merge = df_st_base_overlap.explode('bp')
df_st_base_over_merge.bullets = df_st_base_over_merge.bp.map(lambda t: t[0])
df_st_base_over_merge.selected_para = df_st_base_over_merge.bp.map(lambda t: t[1])
df_st_base_over_merge.bullets_num_tokens = df_st_base_over_merge.bp.map(lambda t: t[2])

df_st_base_over_merge['para'] = df_st_base_over_merge.apply(lambda row:\
    ' '.join([p for i, p in enumerate(row.para) if i in row.selected_para]), axis=1)

df_st_base_over_merge['para_num_tokens'] = df_st_base_over_merge.apply(lambda row:\
    sum([p for i, p in enumerate(row.para_num_tokens) if i in row.selected_para]), axis=1)

df_st_base_over_merge = df_st_base_over_merge.drop(
    columns=['overlap_matrix', 'to_be_merged', 'bp', 'selected_para']).rename(columns={'para': 'text'})

df_st_base_over_merge['compression_ratio'] =\
    df_st_base_over_merge.bullets_num_tokens / df_st_base_over_merge.para_num_tokens

In [80]:
df_st_base_over_merge.bullets_num_tokens.describe()

count    2069.000000
mean       35.655872
std        26.291229
min         2.000000
25%        20.000000
50%        29.000000
75%        43.000000
max       361.000000
Name: bullets_num_tokens, dtype: float64

In [81]:
df_st_base_over_merge.para_num_tokens.describe()

count    2069.000000
mean      176.131948
std        81.374665
min         8.000000
25%       120.000000
50%       163.000000
75%       220.000000
max       988.000000
Name: para_num_tokens, dtype: float64

In [82]:
df_st_base_over_merge[df_st_base_over_merge['compression_ratio'] > config.MAX_RATIO]

Unnamed: 0_level_0,Unnamed: 1_level_0,bullets,text,para_num_tokens,bullets_num_tokens,compression_ratio
book,chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9781905832729,ch_2,The bladder operates as a low-pressure high-vo...,Continence is maintained by a complex interact...,184,56,0.304348
9781905832729,ch_4,The surgical procedure of choice in women is i...,insertion of an artificial urinary sphincter. ...,164,61,0.371951
9781905832729,ch_7,Blood in the urine can originate from anywhere...,Hematuria can originate from anywhere along th...,96,35,0.364583
9781905832729,ch_8,Recurrent urinary tract infection (UTI) is def...,Recurrent urinary tract infection. Recurrent U...,174,63,0.362069
9781905832729,ch_8,Urinalysis is quick and easy. A positive test ...,"Diagnosis. Urinalysis, using dipstick tests fo...",132,44,0.333333
...,...,...,...,...,...,...
9783318067095,ch9,Verification evaluates the capture and transfe...,Once you have arrived at a construct to measur...,801,282,0.352060
9783318068207,hh-6,A basket trial is a biomarker-driven study in ...,"Traditionally, oncology Phase I clinical trial...",161,56,0.347826
9783318068207,hh-7,"The molecular status of, at least, EGFR, ALK, ...",NGS testing for predictive biomarkers. EGFR ac...,229,92,0.401747
9783318068207,hh-7,PD-L1 expression also needs to be tested in pa...,Programmed death-ligand 1. In addition to the ...,96,38,0.395833


In [83]:
print_stats_after_merge(df_st_base_over_merge)

Percentage of paragraphs which are too short to be summarized: 14.26 %

Paragraphs which are too long to fit into the model: 0 paragraphs.
Empty DataFrame
Columns: [bullets, text, para_num_tokens, bullets_num_tokens, compression_ratio]
Index: []


##### Num Tok Threshold No Overlap

In [84]:
#df_st_merge_no_overlap = 
df_st_th_overlap['bp'] = df_st_th_overlap.apply(lambda r:\
    [(b, sp, bnt) for b, sp, bnt in zip(r.bullets, r.selected_para, r.bullets_num_tokens)], axis=1)
df_st_th_over_merge = df_st_th_overlap.explode('bp')
df_st_th_over_merge.bullets = df_st_th_over_merge.bp.map(lambda t: t[0])
df_st_th_over_merge.selected_para = df_st_th_over_merge.bp.map(lambda t: t[1])
df_st_th_over_merge.bullets_num_tokens = df_st_th_over_merge.bp.map(lambda t: t[2])

df_st_th_over_merge['para'] = df_st_th_over_merge.apply(lambda row:\
    ' '.join([p for i, p in enumerate(row.para) if i in row.selected_para]), axis=1)

df_st_th_over_merge['para_num_tokens'] = df_st_th_over_merge.apply(lambda row:\
    sum([p for i, p in enumerate(row.para_num_tokens) if i in row.selected_para]), axis=1)

df_st_th_over_merge = df_st_th_over_merge.drop(
    columns=['overlap_matrix', 'to_be_merged', 'bp', 'selected_para']).rename(columns={'para': 'text'})

df_st_th_over_merge['compression_ratio'] =\
    df_st_th_over_merge.bullets_num_tokens / df_st_th_over_merge.para_num_tokens

In [85]:
df_st_th_over_merge.bullets_num_tokens.describe()

count    1903.000000
mean       38.766159
std        26.499201
min         3.000000
25%        21.000000
50%        31.000000
75%        48.000000
max       246.000000
Name: bullets_num_tokens, dtype: float64

In [86]:
df_st_th_over_merge.para_num_tokens.describe()

count    1903.000000
mean      522.982659
std       222.127554
min       106.000000
25%       349.000000
50%       473.000000
75%       676.000000
max      1134.000000
Name: para_num_tokens, dtype: float64

In [87]:
df_st_th_over_merge[df_st_th_over_merge['compression_ratio'] > config.MAX_RATIO]

Unnamed: 0_level_0,Unnamed: 1_level_0,bullets,text,para_num_tokens,bullets_num_tokens,compression_ratio
book,chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9781905832729,ch_9,Treatment of nocturia is largely on the basis ...,Treatment of nocturia should be on the basis o...,290,74,0.255172
9781908541024,ch_3,Initial symptoms and signs of a brain tumor ar...,Patients with brain tumors typically develop n...,290,73,0.251724
9781908541024,ch_6,Concurrent and adjuvant temozolomide chemother...,Management. Anaplastic oligodendrogliomas have...,532,184,0.345865
9781908541277,ch_4,Epilepsy is the most common serious neurologic...,Epilepsy is the most common serious neurologic...,168,45,0.267857
9781908541277,ch_11,Oral contraceptives containing at least 50 µg ...,"Contraception. Carbamazepine (CBZ), eslicarbaz...",335,84,0.250746
...,...,...,...,...,...,...
9783318066241,ch5,Early liquid peripancreatic collections withou...,An APFC is defined as peripancreatic fluid (a ...,280,84,0.300000
9783318066241,ch5,"In the revised Atlanta classification, acute p...",According to the number of organs involved: th...,259,85,0.328185
9783318067095,ch6,As advances in technology enable digital tools...,"For example, there is a lot of excitement in t...",587,152,0.258944
9783318067095,ch7,"'Effective, unambiguous communication is essen...",Digital biomarkers and clinical outcomes. Are ...,415,152,0.366265


In [88]:
print_stats_after_merge(df_st_th_over_merge)

Percentage of paragraphs which are too short to be summarized: 4.41 %

Paragraphs which are too long to fit into the model: 30 paragraphs.
                                                                  bullets  \
book          chapter                                                       
9781908541178 ch_6      Radiographic spinal stenosis, facet arthrosis ...   
              ch_6      Piriformis tenderness is an area of local tend...   
              ch_6      Facet arthritis and trochanteric bursal irrita...   
              ch_8      Surgery is usually effective for refractory sc...   
9781908541406 ch_6      Thrombocytopenia may occur with some inherited...   
              ch_6      In immune thrombocytopenia, bleeding is uncomm...   
              ch_6      Platelet transfusion has limited indications a...   
9781908541420 ch_6      Drugs used in the management of asthma can be ...   
9781908541703 ch_5      Triple assessment - clinical examination plus ...   
              

### Save dataset

In [89]:
df_st_base_selected_para = df_st_base.groupby(['book', 'chapter', 'para'], sort=False).agg({
    'best_match': lambda b: np.any(list(b))
}).reset_index('para').groupby(['book', 'chapter'], sort=False).agg({
    'para': lambda p: list(p),
    'best_match': lambda b: list(b)
})
df_st_base_selected_para.best_match = df_st_base_selected_para.best_match.map(lambda b: list(np.where(b)[0]))
df_st_base_selected_para.to_csv(op+'df_base_selected_para.csv')

In [90]:
df_st_th_selected_para = df_st_th.groupby(['book', 'chapter', 'para'], sort=False).agg({
    'best_match': lambda b: np.any(list(b))
}).reset_index('para').groupby(['book', 'chapter'], sort=False).agg({
    'para': lambda p: list(p),
    'best_match': lambda b: list(b)
})
df_st_th_selected_para.best_match = df_st_th_selected_para.best_match.map(lambda b: list(np.where(b)[0]))
df_st_th_selected_para.to_csv(op+'df_th_selected_para.csv')

In [91]:
for d, df_merge in zip(
    ['base', 'th', 'base_overlap', 'th_overlap'],
    [df_st_base_merge, df_st_th_merge, df_st_base_over_merge, df_st_th_over_merge]):

    op = OUTPUT_PATH + 'st/'+d+'/'
    if not os.path.exists(op):
        os.makedirs(op)
        
    df_merge.to_csv(op+'df.csv')
        
    df_merge = df_merge.groupby(level=[0, 1], sort=False).agg({
        'bullets': lambda b: list(b),
        'text': lambda t: list(t),
    })
    
    df_merge = df_merge.sample(frac=1, random_state=config.SEED)
    df_merge['num_bulls'] = df_merge.bullets.map(len).cumsum()
    tot_bulls = df_merge.num_bulls.iloc[-1]
    split1 = np.where(df_merge.num_bulls > int(tot_bulls*0.8))[0][0]+1
    split2 = np.where(df_merge.num_bulls > int(tot_bulls*0.9))[0][0]+1
    print(split1, split2)
    
    train, val, test =\
    df_merge.iloc[:split1].explode('bullets'),\
    df_merge.iloc[split1:split2].explode('bullets'),\
    df_merge.iloc[split2:].explode('bullets')

    train['text'] = df_merge.iloc[:split1].explode('text')['text']
    val['text'] = df_merge.iloc[split1:split2].explode('text')['text']
    test['text'] = df_merge.iloc[split2:].explode('text')['text']

    train.to_csv(op+'train.csv')
    val.to_csv(op+'val.csv')
    test.to_csv(op+'test.csv')
    
    with open(op+'train.source', 'w') as tr_s,\
        open(op+'train.target', 'w') as tr_t,\
        open(op+'train.index', 'w') as tr_i:
        for idx, row in train[['text', 'bullets']].iterrows():
            tr_i.write(str(idx) + '\n')
            tr_s.write(row.text + '\n')
            tr_t.write(row.bullets + '\n')
        
    with open(op+'val.source', 'w') as va_s,\
        open(op+'val.target', 'w') as va_t,\
        open(op+'val.index', 'w') as va_i:
        for idx, row in val[['text', 'bullets']].iterrows():
            va_i.write(str(idx) + '\n')
            va_s.write(row.text + '\n')
            va_t.write(row.bullets + '\n')

    with open(op+'test.source', 'w') as te_s,\
        open(op+'test.target', 'w') as te_t,\
        open(op+'test.index', 'w') as te_i:
        for idx, row in test[['text', 'bullets']].iterrows():
            te_i.write(str(idx) + '\n')
            te_s.write(row.text + '\n')
            te_t.write(row.bullets + '\n')

361 408
361 408
362 408
361 408


### **Print Some Examples**

In [92]:
def nice_print(idx, bull, list_text, list_text_num_tok, list_method):
    print(idx)
    print()
    print('Bullet:')
    print(fill(bull, 100))
    print()
    for t, tok, m in zip(list_text, list_text_num_tok, list_method):
        print(m+' (' +str(tok)+'):')
        print(fill(t, 100))
        print()
    
    print(''.join(['#']*100))
    print()

#### W2V vs D2V vs Sentence-Transformers vs Rouge

In [93]:
import random

df_w2v = pd.read_csv(OUTPUT_PATH+'w2v/df.csv').set_index(['book', 'chapter'])
df_d2v = pd.read_csv(OUTPUT_PATH+'d2v/df.csv').set_index(['book', 'chapter'])
df_st = pd.read_csv(OUTPUT_PATH+'st/df.csv').set_index(['book', 'chapter'])
df_rouge = pd.read_csv(magma_dir+'datasets/karger_books_para/'+MODEL+'/df.csv').set_index(['book', 'chapter'])

random.seed(config.SEED)

bullet_examples = random.sample(df_w2v.bullets.tolist(), 10)
print(bullet_examples)

FileNotFoundError: [Errno 2] No such file or directory: '/home/marco/epfl/magma/datasets/karger_books_para_wordembed/pegasus/st/df.csv'

In [None]:
list_method = ['W2V', 'D2V', 'Sentence-Transformers', 'ROUGE']

for bull in bullet_examples:
    idx = df_w2v.loc[df_w2v.bullets == bull].index.tolist()[0]
    
    list_text = [df_w2v.loc[df_w2v.bullets == bull, 'text'].tolist()[0],
        df_d2v.loc[df_d2v.bullets == bull, 'text'].tolist()[0],
        df_st.loc[df_st.bullets == bull, 'text'].tolist()[0],
        df_rouge.loc[df_rouge.bullets == bull, 'text'].tolist()[0]]
    
    list_text_num_tok = [df_w2v.loc[df_w2v.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_d2v.loc[df_d2v.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_st.loc[df_st.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_rouge.loc[df_rouge.bullets == bull, 'para_num_tokens'].tolist()[0]]
    
    nice_print(idx, bull, list_text, list_text_num_tok, list_method)

#### ST Base vs Th vs Base-no-overlap vs Th-no-overlap

In [None]:
import random

df_st_base = pd.read_csv(OUTPUT_PATH+'st/base/df.csv').set_index(['book', 'chapter'])
df_st_th = pd.read_csv(OUTPUT_PATH+'st/th/df.csv').set_index(['book', 'chapter'])
df_st_base_noov = pd.read_csv(OUTPUT_PATH+'st/base_overlap/df.csv').set_index(['book', 'chapter'])
df_st_th_noov = pd.read_csv(OUTPUT_PATH+'st/th_overlap/df.csv').set_index(['book', 'chapter'])

random.seed(config.SEED)

bullet_examples = random.sample(df_st_base.bullets.tolist(), 10)
print(bullet_examples)

In [None]:
list_method = ['BASE', 'THRESHOLD', 'BASE-NOOV', 'THRESHOLD-NOOV']

for bull in bullet_examples:
    idx = df_st_base.loc[df_st_base.bullets == bull].index.tolist()[0]
    
    list_text = [df_st_base.loc[df_st_base.bullets == bull, 'text'].tolist()[0],
        df_st_th.loc[df_st_th.bullets == bull, 'text'].tolist()[0],
        df_st_base_noov.loc[df_st_base_noov.bullets == bull, 'text'].tolist()[0],
        df_st_th_noov.loc[df_st_th_noov.bullets == bull, 'text'].tolist()[0]]
    
    list_text_num_tok = [df_st_base.loc[df_st_base.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_st_th.loc[df_st_th.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_st_base_noov.loc[df_st_base_noov.bullets == bull, 'para_num_tokens'].tolist()[0],
        df_st_th_noov.loc[df_st_th_noov.bullets == bull, 'para_num_tokens'].tolist()[0]]
    
    nice_print(idx, bull, list_text, list_text_num_tok, list_method)