#### AWS

In [None]:
# Absolute locations used when the notebook runs on the AWS instance.
magma_dir = '/home/ubuntu/magma/'
transformers_dir = '/home/ubuntu/transformers/'
bucket_dir = '/home/ubuntu/s3/'

# The cache directory sits inside the bucket directory.
cache_dir = f'{bucket_dir}.cache/'

### **Config**

In [2]:
import os
import sys

# Put the project directory first on the import path so that the
# `import config` below can be resolved (presumably `config.py` lives in
# this directory -- confirm).
# NOTE(review): this is an absolute, machine-specific path; on any other
# machine this line has to be edited by hand before the cell can run.
sys.path.insert(0, '/home/marco/epfl/magma/')
import config

from torch import cuda
# Run on the GPU when torch reports that CUDA is available, else on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
# Architecture family used by this notebook; it selects both the dataset
# sub-directory and the transformers classes used to load checkpoints.
MODEL = 'bart'

# Cache of already loaded (model, tokenizer) pairs, keyed by checkpoint name.
MODELS = dict()

In [4]:
# Directory holding the train/val/test CSV files for the selected MODEL.
data_dir = ''.join([
    config.MAGMA_DIR,
    'datasets/bullet_paragraph_rouge/',
    MODEL,
    '/',
])

#### AWS

In [None]:
# AWS variant: read the dataset for the selected MODEL from the bucket directory.
data_dir = ''.join([
    bucket_dir,
    'datasets/karger_books_para_rouge/',
    MODEL,
    '/',
])

### **Init**

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
from tqdm import tqdm
from textwrap import fill
# Register tqdm's pandas integration so that pandas objects gain the
# `progress_apply` family of methods (progress bars for `apply`).
tqdm.pandas()

### **Function Definition**

##### Import Model and Tokenizer

In [13]:
def import_model_tok(model_name_or_path, verbose=False):
    """Return a ``(model, tokenizer)`` pair, loading it only on first use.

    Loaded pairs are memoised in the module-level ``MODELS`` dict, keyed by
    ``model_name_or_path``. The transformers classes used for loading are
    selected from the module-level ``MODEL`` setting, not from
    ``model_name_or_path``.

    Args:
        model_name_or_path: Checkpoint identifier passed unchanged to
            ``from_pretrained``; also used as the cache key.
        verbose: When true, print progress messages and the model config.

    Returns:
        The ``(model, tokenizer)`` tuple stored in ``MODELS``.

    Raises:
        ValueError: If ``MODEL`` contains neither ``'bart'`` nor
            ``'pegasus'``. Previously this case failed later with an
            ``UnboundLocalError`` because no branch bound ``model``.
    """
    if model_name_or_path in MODELS:
        if verbose : print('[+] model already present in cache\n')
        return MODELS[model_name_or_path]
    if verbose : print('[*] importing the model\n')

    # The transformers imports stay inside the branches so that only the
    # classes of the selected architecture are imported.
    if 'bart' in MODEL:
        from transformers import BartForConditionalGeneration, BartTokenizer

        model_cls, tokenizer_cls = BartForConditionalGeneration, BartTokenizer
    elif 'pegasus' in MODEL:
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer

        model_cls, tokenizer_cls = PegasusForConditionalGeneration, PegasusTokenizer
    else:
        raise ValueError(
            "unsupported MODEL %r: expected a name containing 'bart' or 'pegasus'"
            % (MODEL,))

    model = model_cls.from_pretrained(model_name_or_path)
    tokenizer = tokenizer_cls.from_pretrained(model_name_or_path)

    if verbose : print(model.config)
    MODELS[model_name_or_path] = model, tokenizer
    if verbose : print('[+] the model is now present in cache\n')
    return MODELS[model_name_or_path]

##### Print Examples

In [7]:
def print_examples(model_name_list, df, n_examples=10):
    """Print sampled texts with each model's summary and the reference.

    Rows are sampled from ``df`` with ``config.SEED`` as the random state.
    For every sampled row the index, the wrapped ``text``, one generated
    summary per model, and the wrapped ``bullets`` reference are printed,
    followed by a separator line.

    Args:
        model_name_list: Checkpoint names; each is resolved through
            ``import_model_tok``.
        df: DataFrame providing the ``text`` and ``bullets`` columns that
            are read from each sampled row.
        n_examples: Number of rows to sample. Values larger than ``len(df)``
            are clamped, because ``DataFrame.sample`` without replacement
            raises when asked for more rows than exist.

    Returns:
        None. All results are written to stdout.
    """
    # Never request more rows than the frame holds.
    n_examples = min(n_examples, len(df))
    df_examples = df.sample(n_examples, axis='index', random_state=config.SEED)
    if len(df_examples) == 0:
        # Nothing to show, so do not load any model.
        return

    # Resolve every model once and move it to the device once, instead of
    # repeating the lookup and the transfer for every sampled row.
    loaded = []
    for model_name in model_name_list:
        model, tokenizer = import_model_tok(model_name)
        loaded.append((model_name, model.to(device), tokenizer))

    separator = '#' * 100

    for idx, row in df_examples.iterrows():
        print(idx)
        print(fill(row.text, 100))
        print()
        for model_name, model, tokenizer in loaded:
            input_ids = tokenizer.encode(row.text, return_tensors='pt').to(device)
            # `generate` returns a batch; only one text was encoded, so the
            # first sequence is the summary.
            summ_enc = model.generate(
                input_ids,
                min_length = config.ONE_BULLET_MIN_LEN,
                max_length = config.ONE_BULLET_MAX_LEN,
                length_penalty = config.LENGTH_PENALTY,
                num_beams = config.NUM_BEAMS,
                no_repeat_ngram_size = config.NO_REPEAT_NGRAM_SIZE,
                early_stopping = True)[0]
            # Counted before decoding, so special tokens are included.
            summ_num_tok = len(summ_enc)
            summ = tokenizer.decode(summ_enc, skip_special_tokens=True)

            print('Prediction\n%s (%d tok):\n'%(model_name, summ_num_tok))
            print(fill(summ, 100))
            print()

        print('Reference:')
        print(fill(row.bullets, 100))
        print()
        print(separator)
        print()

## **Karger Books Para**

In [10]:
# Load the three splits in train/val/test order; each one is indexed by
# (book, chapter).
df_train, df_val, df_test = (
    pd.read_csv(data_dir + file_name).set_index(['book', 'chapter'])
    for file_name in ('train.csv', 'val.csv', 'test.csv')
)

### **Print and Summarization**

##### Print Train Examples

In [14]:
# Show sampled training rows next to the distilbart summaries.
print_examples(
    model_name_list=['sshleifer/distilbart-cnn-12-6'],
    df=df_train,
)

[+] model already present in cache

(9781912776696, 'hh-5')
Kaplan-Meier survival curves are often used to compare the data between two groups of subjects.
Figure 2.2 shows Kaplan-Meier curves for OS in a randomized study of patients with human epidermal
growth factor receptor 2 (HER2)-positive metastatic breast cancer treated either with or without
trastuzumab. The Kaplan-Meier curve steps down at time points at which deaths occur, while censored
observations are denoted by notches on the curve. In this study, the follow-up period ranged from 3
months to 74 months. The Kaplan-Meier curve plots the probability of being event free over time,
with these probabilities being estimated from the data in the study. Note that the curve for
patients who received trastuzumab is consistently above the curve for those who did not receive
trastuzumab, indicating a higher survival probability in that group.

Prediction (47 tokens):
 Figure 2.2 shows Kaplan-Meier curves for OS in a randomized study o

(9781910797662, 'ch03')
Anorexia, nausea, weight loss and malaise may result from renal failure due to bilateral ureteric
obstruction, or from the systemic effects of the tumor itself. Bone pain or pathological fractures
may result from skeletal metastases (Figure 3.3); the pain is unrelieved by rest and can be severe.
Anemia and hypercalcemia may occur as metabolic complications of advanced disease; leukocytosis is
occasionally associated with the elaboration of colony-stimulating factors by the tumor. Headache or
disordered thought processes are uncommon as a presenting feature but may indicate underlying brain
metastases or carcinomatous meningitis.

Prediction (57 tokens):
 Anorexia, nausea, weight loss and malaise may result from renal failure due to bilateral ureteric
obstruction. Bone pain or pathological fractures may result from skeletal metastases. Anemia and
hypercalcemia may occur as metabolic complications of advanced disease.

Reference:
Recurrent infections may indicate 

##### Print Val Examples

In [16]:
# Show sampled validation rows next to the distilbart summaries.
print_examples(
    model_name_list=['sshleifer/distilbart-cnn-12-6'],
    df=df_val,
)

[+] model already present in cache

(9781908541178, 'ch_6')
Many anatomic factors can play a primary or a secondary role in the development and progression of
low back pain syndromes. The most common causes of low back pain are mechanical in origin.
Mechanical disorders of the lumbar spine are related to injury, overuse or deformity of a spinal
structure. The most important traumatic factors in low back pain relate to soft tissue structures.
Precise identification of the injured tissue, and the role of that injury in the consequent pain and
dysfunction, can be frustratingly difficult. Problems occur in muscles, intervertebral discs, facet
joints, ligaments or spinal nerves. Aging causes modification of these structures over time, and
different parts of the spine tend to be at greater risk for change or injury during different
decades of life. Early in life, muscle injuries are more frequent, while joint problems occur in the
sixth decade of life (Table 3.1).

Prediction (46 tokens):
 M

(9781910797815, 'chp9')
There are no specific diagnostic criteria for MPN-BP. The principal criterion is the same as for de
novo acute myeloid leukemia (AML): more than 20% blasts in bone marrow or peripheral blood.
Persistence is also important if only peripheral blood blasts are considered. Importantly,
acceleration or progression of myelofibrosis is usually apparent before the development of overt
leukemia. Signs of this include.

Prediction (47 tokens):
 There are no specific diagnostic criteria for MPN-BP. The principal criterion is the same as for de
novo acute myeloid leukemia (AML): more than 20% blasts in bone marrow or peripheral blood.

Reference:
Signs of accelerating myelofibrosis may precede the development of overt leukemia.

####################################################################################################

(9783318068207, 'hh-7')
Melanoma is a highly mutated malignancy, with mutations documented in all subtypes (Table 5.3)., The
KIT mutation is associ

##### Print Test Examples

In [18]:
# Show sampled test rows next to the distilbart summaries.
print_examples(
    model_name_list=['sshleifer/distilbart-cnn-12-6'],
    df=df_test,
)

[+] model already present in cache

(9781908541277, 'ch_8')
After a hiatus of nearly 20 years, 16 new AEDs and two devices - the vagus nerve stimulator and deep
brain stimulator - have received licenses for the adjunctive treatment of refractory epilepsy, the
last so far only in Europe. Gabapentin (GBP), lacosamide (LCM), LTG, levetiracetam (LEV),
oxcarbazepine (OXC), pregabalin (PGB), tiagabine (TGB), TPM and zonisamide (ZNS) are widely
available for partial seizures. Rufinamide (RFN) has been licensed in Europe and the USA for
adjunctive treatment of seizures in Lennox-Gastaut syndrome. Eslicarbazepine acetate (ESL) is
licensed in Europe as adjunctive treatment for partial seizures with or without secondary
generalization. Retigabine (RTG; ezogabine in the USA) and perampanel (PER) have recently been
approved for use in Europe and the USA for the same indication. After 20 years of global experience,
vigabatrin (VGB) has been approved in the USA for the treatment of infantile spasms a

(9781910797211, 'ch04')
The study started in 2000 with a median follow-up of 3 years and maximum of 7 years. During this
time the dopamine agonists varied, including ergot agonists that are no longer in widespread use.
The randomization to levodopa or alternative regimens was at the discretion of the clinician and a
large part of the study was conducted in centers for the care of the elderly. The findings suggest
relatively small but persistent benefits of starting therapy with levodopa rather than the listed
alternatives. Interestingly, initial treatment with a MAOB inhibitor appeared to be at least as
effective as treatment with a dopamine agonist. Patients assigned to the levodopa arm scored, on
average, 1.8 points (95% CI 0.5-3.0) higher on the Parkinson's Disease Questionnaire-39 (PDQ-39)
mobility subscale than those assigned to levodopa-sparing therapies; however, the effects on non-
motor aspects of the scale are unclear. A cost-utility analysis of the study will be reported
sep