#### AWS

In [2]:
# Absolute directories used by the AWS variant of this notebook.
magma_dir, bucket_dir, transformers_dir = (
    '/home/ubuntu/magma/',
    '/home/ubuntu/s3/',
    '/home/ubuntu/transformers/',
)
# Passed as `cache_dir` when models/tokenizers are loaded; lives inside the bucket directory.
cache_dir = f'{bucket_dir}.cache/'

### **Config**

In [3]:
import os
import sys

# NOTE(review): this path is hard-coded to one user's machine, while the AWS cell
# defines magma_dir = '/home/ubuntu/magma/' — confirm which location should be on
# sys.path in each environment.
sys.path.insert(0, '/home/marco/epfl/magma/')
# Project settings module (presumably resolved through the path inserted above);
# later cells read constants such as config.SEED and config.NUM_BEAMS from it.
import config

from torch import cuda
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
# MODEL: name of the per-model dataset sub-directory used when building data_dir.
# MODELS: memo of already loaded (model, tokenizer) pairs, keyed by model name or path.
MODEL, MODELS = 'pegasus', dict()

In [5]:
# Dataset directory for the selected MODEL, rooted at config.MAGMA_DIR
data_dir = ''.join([config.MAGMA_DIR, 'datasets/karger_books_para_wordembed/', MODEL, '/'])

#### AWS

In [6]:
# AWS variant: rebinds data_dir so the dataset is read from under bucket_dir.
data_dir = ''.join([bucket_dir, 'datasets/karger_books_para_wordembed/', MODEL, '/'])

### **Init**

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
from tqdm import tqdm
from textwrap import fill
tqdm.pandas()

### **Function Definition**

##### Import Model and Tokenizer

In [9]:
def import_model_tok(model_name_or_path, verbose=False):
    """Return the ``(model, tokenizer)`` pair for a Pegasus checkpoint, loading it at most once.

    Pairs are memoised in the notebook-level ``MODELS`` dict under
    ``model_name_or_path``; a second call with the same key returns the stored
    pair without touching ``transformers``.

    Args:
        model_name_or_path: value forwarded to ``from_pretrained`` and used as
            the key in ``MODELS``.
        verbose: when true, print progress messages and the model config.

    Returns:
        The ``(model, tokenizer)`` tuple stored in ``MODELS``.
    """
    def _say(message):
        # Progress messages are only emitted on request.
        if verbose:
            print(message)

    # Fast path: this checkpoint was already loaded during the session.
    if model_name_or_path in MODELS:
        _say('[+] model already present in cache\n')
        return MODELS[model_name_or_path]

    _say('[*] importing the model\n')

    # Imported lazily, only when a model actually has to be loaded.
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    # Both objects are loaded with the notebook-level cache_dir.
    model = PegasusForConditionalGeneration.from_pretrained(
        model_name_or_path, cache_dir=cache_dir)
    tokenizer = PegasusTokenizer.from_pretrained(
        model_name_or_path, cache_dir=cache_dir)

    if verbose:
        print(model.config)

    MODELS[model_name_or_path] = (model, tokenizer)
    _say('[+] the model is now present in cache\n')
    return MODELS[model_name_or_path]

##### Print Examples

In [11]:
def print_examples(model_name_list, df, n_examples=10):
    """Print sampled rows of ``df`` with the summaries each model generates for them.

    ``n_examples`` rows are sampled using ``config.SEED`` as the random state.
    For each sampled row the index, the wrapped source text and the wrapped
    reference bullets are printed. Then, for every entry of
    ``model_name_list``, each sequence returned by ``model.generate`` is
    decoded, scored against the reference with ``evaluate_prediction`` and
    printed with its token count and scores.

    Args:
        model_name_list: names or paths accepted by ``import_model_tok``.
        df: DataFrame whose rows expose ``text`` and ``bullets``.
        n_examples: number of rows to sample from ``df``.
    """
    wrap_width = 100
    # ROUGE entries in the order they appear in the printed table.
    rouge_fields = (
        'rouge1_precision', 'rouge1_recall', 'rouge1_fmeasure',
        'rouge2_precision', 'rouge2_recall', 'rouge2_fmeasure',
        'rougeL_precision', 'rougeL_recall', 'rougeL_fmeasure')
    rouge_layout = 'R1\tp: %.2f \tr: %.2f \tf: %.2f\nR2\tp: %.2f \tr: %.2f \tf: %.2f\nRL\tp: %.2f \tr: %.2f \tf: %.2f'

    sampled_rows = df.sample(n_examples, axis='index', random_state=config.SEED)

    for row_index, sample in sampled_rows.iterrows():
        # Header for the example: index, source text and reference bullets.
        print(row_index)
        print(fill(sample.text, wrap_width))
        print()
        print('Reference:')
        print(fill(sample.bullets, wrap_width))
        print()

        for model_name in model_name_list:
            model, tokenizer = import_model_tok(model_name)
            model = model.to(device)

            # Encode the source text and generate candidates; one sequence is
            # requested per beam (num_return_sequences equals num_beams).
            input_ids = tokenizer.encode(sample.text, return_tensors='pt').to(device)
            candidates = model.generate(
                input_ids,
                min_length=config.ONE_BULLET_MIN_LEN,
                max_length=config.ONE_BULLET_MAX_LEN,
                length_penalty=config.LENGTH_PENALTY,
                num_beams=config.NUM_BEAMS,
                no_repeat_ngram_size=config.NO_REPEAT_NGRAM_SIZE,
                num_return_sequences=config.NUM_BEAMS,
                early_stopping=True)

            for candidate in candidates:
                token_count = len(
                    tokenizer.convert_ids_to_tokens(candidate, skip_special_tokens=True))
                summary = tokenizer.decode(candidate, skip_special_tokens=True)

                rouge, sent_trans, w2v = evaluate_prediction(summary, sample.bullets)

                print('%s (%d tok):'%(model_name, token_count))
                print(rouge_layout % tuple(rouge[field] for field in rouge_fields))
                print('ST\t%.2f\t\tW2V\t%.2f'%(sent_trans, w2v))
                print()
                print(fill(summary, wrap_width))
                print()
            print()

        # Separator line between examples.
        print('#' * wrap_width)
        print()

## **Sentence-Transformers**

In [14]:
# Sub-directory of data_dir holding the CSV splits loaded in this section.
data_dir_st = ''.join((data_dir, 'st/base/'))

In [15]:
# Read the train / val / test CSV files, each indexed by (book, chapter).
df_train, df_val, df_test = [
    pd.read_csv(data_dir_st + csv_name).set_index(['book', 'chapter'])
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
]

### **Print and Summarization**

In [None]:
from transformers import AutoConfig
# NOTE(review): model_name_or_path is not assigned at notebook level in the cells
# visible here (it only appears as a parameter of import_model_tok) — confirm it is
# defined before this cell runs, otherwise this line raises NameError.
model_config = AutoConfig.from_pretrained(model_name_or_path)
# Copy the ONE_BULLET_* length limits from config onto the loaded model config.
model_config.min_length = config.ONE_BULLET_MIN_LEN
model_config.max_length = config.ONE_BULLET_MAX_LEN

# Save the adjusted config into the fine-tuning checkpoint directory under bucket_dir.
model_config_dir = bucket_dir+'fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/'
model_config.save_pretrained(model_config_dir)

#### Print Train Examples

In [16]:
# Train split: show examples for 'google/pegasus-large' and the fine-tuning checkpoint.
print_examples(
    model_name_list=[
        'google/pegasus-large',
        ''.join((bucket_dir, 'fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/')),
    ],
    df=df_train,
)

(9781912776696, 'hh-5')
Kaplan-Meier curves provide a way of plotting the distribution of a time-to-event endpoint. In
Figure 2.1 the dashed line shows that at 24 months' follow-up the estimated survival probability is
54%. Kaplan-Meier survival curves are often used to compare the data between two groups of subjects.
Figure 2.2 shows Kaplan-Meier curves for OS in a randomized study of patients with human epidermal
growth factor receptor 2 (HER2)-positive metastatic breast cancer treated either with or without
trastuzumab. The Kaplan-Meier curve steps down at time points at which deaths occur, while censored
observations are denoted by notches on the curve. In this study, the follow-up period ranged from 3
months to 74 months.

Reference:
Kaplan-Meier curves plot the probability of being event free over time. The curves from different
treatment groups can be plotted against each other to show the differences in outcome.

google/pegasus-large (41 tok):
R1	p: 15.15 	r: 16.67 	f: 15.87
R2

google/pegasus-large (22 tok):
R1	p: 27.78 	r: 20.83 	f: 23.81
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 22.22 	r: 16.67 	f: 19.05
ST	69.64		W2V	22.20

Postamputation persistent pain is a special case of PPSP because large nerves are deliberately cut
in all patients.

google/pegasus-large (48 tok):
R1	p: 19.05 	r: 33.33 	f: 24.24
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 11.90 	r: 20.83 	f: 15.15
ST	64.00		W2V	34.45

Postamputation persistent pain is a special case of PPSP because large nerves are deliberately cut
in all patients. A surprising omission in studies of amputation pain is information about how the
nerves amputated are managed (id est clean cut or ligature tied).

google/pegasus-large (26 tok):
R1	p: 20.83 	r: 20.83 	f: 20.83
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 12.50 	r: 12.50 	f: 12.50
ST	40.35		W2V	33.53

A surprising omission in studies of amputation pain is information about how the nerves amputated
are managed (id est clean cut or ligature tied).

google/pegasus-large (48 tok):
R1	p:

google/pegasus-large (25 tok):
R1	p: 10.00 	r: 13.33 	f: 11.43
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 10.00 	r: 13.33 	f: 11.43
ST	48.54		W2V	26.39

Patients often complain not only of spontaneous pain, but also of pain from stimuli that are not
normally painful (allodynia).

google/pegasus-large (23 tok):
R1	p: 31.82 	r: 46.67 	f: 37.84
R2	p: 4.76 	r: 7.14 	f: 5.71
RL	p: 22.73 	r: 33.33 	f: 27.03
ST	53.86		W2V	56.82

Central to the understanding of clinical pain is the concept that pain may be present without an
obvious peripheral source or cause.

google/pegasus-large (37 tok):
R1	p: 10.00 	r: 20.00 	f: 13.33
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 6.67 	r: 13.33 	f: 8.89
ST	49.00		W2V	23.41

Patients often complain not only of spontaneous pain, but also of pain from stimuli that are not
normally painful (allodynia). For example, a light touch may be described as painful.

google/pegasus-large (25 tok):
R1	p: 10.00 	r: 13.33 	f: 11.43
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 10.00 	r: 13.33 	f: 11.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (34 tok):
R1	p: 13.64 	r: 7.69 	f: 9.84
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 9.09 	r: 5.13 	f: 6.56
ST	55.56		W2V	12.76

Non-small-cell lung cancer (NSCLC) that lacks clear differentiation by morphology and
immunohistochemistry is classified as 'NSCLC, not otherwise specified' (NOS).

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (32 tok):
R1	p: 14.29 	r: 7.69 	f: 10.00
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 9.52 	r: 5.13 	f: 6.67
ST	53.94		W2V	14.49

Non-small-cell lung cancer (NSCLC) that lacks clear differentiation by morphology and
immunohistochemistry is classified as 'NSCLC, not otherwise specified'.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (33 tok):
R1	p: 13.64 	r: 7.69 	f: 9.84
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 9.09 	r: 5.13 	f: 6.56
ST	55.81		W2V	12.76

Non-small-cell lung cancer (NSCLC) that lacks clear differentiatio

google/pegasus-large (29 tok):
R1	p: 30.43 	r: 20.00 	f: 24.14
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 21.74 	r: 14.29 	f: 17.24
ST	60.04		W2V	37.68

Ultimately, therapies that successfully promote remyelination and repair may be applicable across
the spectrum of MS subtypes, including progressive forms of the disease.

google/pegasus-large (40 tok):
R1	p: 35.48 	r: 31.43 	f: 33.33
R2	p: 13.33 	r: 11.76 	f: 12.50
RL	p: 22.58 	r: 20.00 	f: 21.21
ST	60.80		W2V	50.06

Initiated in 2013, the first multinational Phase II trial of anti-LINGO-1 antibodies in patients
with acute optic neuritis has shown an effect on promoting nerve repair following optic neuritis.

google/pegasus-large (40 tok):
R1	p: 20.59 	r: 20.00 	f: 20.29
R2	p: 3.03 	r: 2.94 	f: 2.99
RL	p: 14.71 	r: 14.29 	f: 14.49
ST	40.60		W2V	31.20

Mastinib has completed a positive Phase IIb study in patients with primary and secondary progressive
MS, the results of which are reported to include improvement in the Multiple Sclerosis Functi

#### Print Val Examples

In [17]:
# Validation split: show examples for 'google/pegasus-large' and the fine-tuning checkpoint.
print_examples(
    model_name_list=[
        'google/pegasus-large',
        ''.join((bucket_dir, 'fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/')),
    ],
    df=df_val,
)

(9781908541178, 'ch_6')
Many anatomic factors can play a primary or a secondary role in the development and progression of
low back pain syndromes. The most common causes of low back pain are mechanical in origin.
Mechanical disorders of the lumbar spine are related to injury, overuse or deformity of a spinal
structure. The most important traumatic factors in low back pain relate to soft tissue structures.
Precise identification of the injured tissue, and the role of that injury in the consequent pain and
dysfunction, can be frustratingly difficult. Problems occur in muscles, intervertebral discs, facet
joints, ligaments or spinal nerves. Aging causes modification of these structures over time, and
different parts of the spine tend to be at greater risk for change or injury during different
decades of life. Early in life, muscle injuries are more frequent, while joint problems occur in the
sixth decade of life (Table 3.1).

Reference:
Mechanical disorders - muscle strain, spondylolisth

google/pegasus-large (52 tok):
R1	p: 37.21 	r: 88.89 	f: 52.46
R2	p: 33.33 	r: 82.35 	f: 47.46
RL	p: 37.21 	r: 88.89 	f: 52.46
ST	73.95		W2V	61.40

Both an open-label and a triple-blind RCT have demonstrated that fluid resuscitation using lactated
Ringer's solution (a balanced salt solution) is associated with a decreased inflammatory response
when compared with fluid resuscitation with normal saline (with a high chloride content).

google/pegasus-large (50 tok):
R1	p: 33.33 	r: 77.78 	f: 46.67
R2	p: 24.39 	r: 58.82 	f: 34.48
RL	p: 30.95 	r: 72.22 	f: 43.33
ST	60.66		W2V	51.90

In studies addressing other clinical scenarios, balanced fluids such as lactated Ringer's solution
seem to be associated with a decreased need for blood products and a lower incidence of renal
replacement therapy, hyperkalemia and postoperative infections when compared with normal saline.

google/pegasus-large (53 tok):
R1	p: 36.36 	r: 88.89 	f: 51.61
R2	p: 32.56 	r: 82.35 	f: 46.67
RL	p: 36.36 	r: 88.89 	f: 51.

google/pegasus-large (26 tok):
R1	p: 30.43 	r: 14.00 	f: 19.18
R2	p: 9.09 	r: 4.08 	f: 5.63
RL	p: 26.09 	r: 12.00 	f: 16.44
ST	75.38		W2V	33.60

As a result, tumors with high somatic mutation rates may be more susceptible to immuno-oncology
therapies than those with lower mutation rates.

google/pegasus-large (65 tok):
R1	p: 28.07 	r: 32.00 	f: 29.91
R2	p: 7.14 	r: 8.16 	f: 7.62
RL	p: 15.79 	r: 18.00 	f: 16.82
ST	74.99		W2V	49.03

As a result, tumors with high somatic mutation rates may be more susceptible to immuno-oncology
therapies than those with lower mutation rates. Somatic mutation rates differ markedly, both between
tumor types and within an individual tumor type: the rate may vary more than 1000-fold between
tumors with the highest and lowest rates (Figure 3.2).

google/pegasus-large (39 tok):
R1	p: 38.24 	r: 26.00 	f: 30.95
R2	p: 12.12 	r: 8.16 	f: 9.76
RL	p: 20.59 	r: 14.00 	f: 16.67
ST	65.98		W2V	43.71

Somatic mutation rates differ markedly, both between tumor types and wi

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (12 tok):
R1	p: 0.00 	r: 0.00 	f: 0.00
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 0.00 	r: 0.00 	f: 0.00
ST	7.89		W2V	-9.79

There are no specific diagnostic criteria for MPN-BP.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (12 tok):
R1	p: 0.00 	r: 0.00 	f: 0.00
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 0.00 	r: 0.00 	f: 0.00
ST	7.77		W2V	-1.59

There is no specific diagnostic criteria for MPN-BP.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (14 tok):
R1	p: 0.00 	r: 0.00 	f: 0.00
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 0.00 	r: 0.00 	f: 0.00
ST	24.43		W2V	4.61

There are no specific diagnostic criteria for myeloproliferative neoplasms-BP.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (14 tok):
R1	p: 0.00 	r: 0.00 	f: 0.00
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 0.00 	r: 0.00 	f: 0.00
ST	24.47		W2V	13.10

There 

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (27 tok):
R1	p: 27.27 	r: 42.86 	f: 33.33
R2	p: 19.05 	r: 30.77 	f: 23.53
RL	p: 22.73 	r: 35.71 	f: 27.78
ST	71.87		W2V	38.48

Treatment to reduce cardiovascular risk factors, and promotion of a generally healthy lifestyle, may
be sufficient intervention for very-low-risk patients.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (33 tok):
R1	p: 10.00 	r: 14.29 	f: 11.76
R2	p: 5.26 	r: 7.69 	f: 6.25
RL	p: 10.00 	r: 14.29 	f: 11.76
ST	33.36		W2V	17.57

Low-dose aspirin has been shown to reduce both microvascular symptoms (exempli gratia
erythromelalgia) and transient neurological and visual disturbances.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (24 tok):
R1	p: 14.29 	r: 21.43 	f: 17.14
R2	p: 5.00 	r: 7.69 	f: 6.06
RL	p: 9.52 	r: 14.29 	f: 11.43
ST	38.64		W2V	26.16

Low-dose aspirin has been shown to reduce both microvascular 

#### Print Test Examples

In [18]:
# Test split: show examples for 'google/pegasus-large' and the fine-tuning checkpoint.
print_examples(
    model_name_list=[
        'google/pegasus-large',
        ''.join((bucket_dir, 'fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/')),
    ],
    df=df_test,
)

(9781908541277, 'ch_8')
Indications. OXC has a similar spectrum of efficacy to CBZ against partial and tonic-clonic
seizures. It tends to be better tolerated than CBZ with fewer neurotoxic side effects. Dosage. The
recommended starting dose for OXC in adults is 150-600 mg daily in two doses. The dose can be
titrated upwards as clinically indicated to 3000-4000 mg daily. A starting dose of 5 mg/kg daily in
children over 3 years of age can be prescribed, increasing gradually to a maintenance dose of about
30 mg/kg daily. Patients already on CBZ may be switched immediately to OXC using a dosage ratio of
1.5 OXC to 1 CBZ. Particular care in immediate switching needs to be taken when the daily CBZ dose
exceeds 1200 mg. Plasma concentrations of the clinically active metabolite of OXC increase linearly
with dose. No studies, however, have attempted to relate elevated plasma levels to efficacy or
toxicity.

Reference:
This wider choice of AEDs permits pharmacological treatment to be better mat

google/pegasus-large (54 tok):
R1	p: 25.53 	r: 63.16 	f: 36.36
R2	p: 17.39 	r: 44.44 	f: 25.00
RL	p: 25.53 	r: 63.16 	f: 36.36
ST	78.01		W2V	55.99

The variability in how PK deficiency manifests is thought to reflect the heterogeneity of these
causative mutations, as well as the fact that most individuals with PK deficiency will be compound
heterozygotes (id est they will have a different mutation in each copy of the PKLR gene).

google/pegasus-large (74 tok):
R1	p: 18.46 	r: 63.16 	f: 28.57
R2	p: 14.06 	r: 50.00 	f: 21.95
RL	p: 18.46 	r: 63.16 	f: 28.57
ST	79.25		W2V	55.93

As discussed in Chapter 1, mutations in the PKLR gene on chromosome 1 are responsible for PK
deficiency. The variability in how PK deficiency manifests is thought to reflect the heterogeneity
of these causative mutations, as well as the fact that most individuals with PK deficiency will be
compound heterozygotes (id est they will have a different mutation in each copy of the PKLR gene).

google/pegasus-large (74 to

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (51 tok):
R1	p: 5.26 	r: 28.57 	f: 8.89
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 5.26 	r: 28.57 	f: 8.89
ST	56.47		W2V	6.21

Overweight/obesity as a whole predisposes to numerous cardiac complications such as coronary heart
disease (CHD), atrial fibrillation, heart failure and sudden cardiac death as a result of
abnormalities in blood glucose, lipids, blood pressure, coagulation and inflammation.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (40 tok):
R1	p: 7.14 	r: 28.57 	f: 11.43
R2	p: 0.00 	r: 0.00 	f: 0.00
RL	p: 7.14 	r: 28.57 	f: 11.43
ST	53.07		W2V	6.14

Overweight/obesity as a whole predisposes to, or is associated with, numerous cardiac complications
such as coronary heart disease (CHD), atrial fibrillation, heart failure and sudden cardiac death.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (37 tok):
R1	p: 7.41 	r: 28.57 	

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (47 tok):
R1	p: 76.19 	r: 53.33 	f: 62.75
R2	p: 34.15 	r: 23.73 	f: 28.00
RL	p: 54.76 	r: 38.33 	f: 45.10
ST	84.80		W2V	73.95

The results of the PD Med study suggest relatively small but persistent benefits of starting therapy
with levodopa rather than the listed alternatives. Initial treatment with a MAOB inhibitor appeared
to be at least as effective as treatment with a dopamine agonist.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (47 tok):
R1	p: 76.19 	r: 53.33 	f: 62.75
R2	p: 34.15 	r: 23.73 	f: 28.00
RL	p: 54.76 	r: 38.33 	f: 45.10
ST	85.66		W2V	74.83

The results of the PD Med study suggest relatively small but persistent benefits of starting therapy
with levodopa rather than the listed alternatives. Initial treatment with a MAOB inhibitor appears
to be at least as effective as treatment with a dopamine agonist.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (28 tok):
R1	p: 65.22 	r: 53.57 	f: 58.82
R2	p: 54.55 	r: 44.44 	f: 48.98
RL	p: 56.52 	r: 46.43 	f: 50.98
ST	82.05		W2V	80.13

Gallbladder cancer is the fifth most common gastrointestinal (GI) cancer in the USA and is the most
common GI cancer in Native Americans.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (28 tok):
R1	p: 65.22 	r: 53.57 	f: 58.82
R2	p: 45.45 	r: 37.04 	f: 40.82
RL	p: 52.17 	r: 42.86 	f: 47.06
ST	78.45		W2V	78.88

Gallbladder carcinoma is the fifth most common gastrointestinal (GI) cancer in the USA and is the
most common GI cancer in Native Americans.

/home/ubuntu/s3/fine-tuning/ft_pegasus_para_wordembed_gas64_lr5e-05/checkpoint-686/ (27 tok):
R1	p: 68.18 	r: 53.57 	f: 60.00
R2	p: 57.14 	r: 44.44 	f: 50.00
RL	p: 59.09 	r: 46.43 	f: 52.00
ST	82.14		W2V	79.41

Gallbladder cancer is the fifth most common gastrointestinal (GI) cancer in the USA an