#### For Colab

In [49]:
# This cell is not executed as Python logic: it is a bare string literal holding a
# JavaScript snippet, so running the cell only echoes the string back.
# The snippet defines ClickConnect(), which logs "Working" and clicks the
# "colab-toolbar-button" element, and schedules it with setInterval every
# 900000 ms (15 minutes) — presumably to be pasted into the browser console to keep
# the Colab session alive; confirm.
# NOTE(review): as written, clearInterval(i) on the last line cancels the timer right
# after it is created; it looks like it is meant to be run separately to stop the clicks.
"""
function ClickConnect(){
    console.log("Working");
    document.querySelector("colab-toolbar-button").click() 
}
var i = setInterval(ClickConnect, 900000)
clearInterval(i)
"""

'\nfunction ClickConnect(){\n    console.log("Working");\n    document.querySelector("colab-toolbar-button").click() \n}\nvar i = setInterval(ClickConnect, 900000)\nclearInterval(i)\n'

In [50]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
# force_remount=True is passed so that, judging by the flag name, an existing mount is
# redone instead of reused — see the Colab documentation to confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [51]:
# Project root on the mounted Drive. It is put on sys.path in the config cell so that
# `import config` can be resolved.
drive_dir = '/content/drive/My Drive/MAGMA: Summarization/'

#### Install Libraries

In [20]:
# Colab setup: install the libraries this notebook relies on.
# transformers is pinned to 4.1.1; sentencepiece only excludes release 0.1.92.
# datasets and rouge_score are left unpinned — the recorded output of this cell shows
# sentencepiece 0.1.95 and datasets 1.2.0 were in place when it was last run.
!pip install transformers==4.1.1
!pip install -U sentencepiece!=0.1.92
!pip install -U datasets
!pip install rouge_score

Requirement already up-to-date: sentencepiece!=0.1.92 in /usr/local/lib/python3.6/dist-packages (0.1.95)
Requirement already up-to-date: datasets in /usr/local/lib/python3.6/dist-packages (1.2.0)


### **Config**

In [52]:
import os
import sys

# Put the Drive project folder first on the import path so that `config` can be
# imported (presumably config.py lives in drive_dir — confirm).
sys.path.insert(0, drive_dir)
import config

from torch import cuda
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [53]:
# Model family selector: later cells and import_model_tok branch on whether this
# string contains 'bart' or 'pegasus'.
MODEL = 'bart'
# Cache filled by import_model_tok: checkpoint name/path -> (model, tokenizer).
MODELS = {}

In [54]:
# Directory holding the train/val/test CSVs for the selected model family.
# It is built from drive_dir so the Drive project root is spelled out in one place only.
assign_bullets_dir = drive_dir + 'datasets/chunk_chapter/assign_bullets/'

if 'bart' in MODEL:
    data_dir = assign_bullets_dir + 'bart/'
elif 'pegasus' in MODEL:
    data_dir = assign_bullets_dir + 'pegasus/'
else:
    # Fail here, with a clear message, instead of with a NameError on data_dir
    # in whichever later cell happens to use it first.
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing 'bart' or 'pegasus'" % (MODEL,))

### **Init**

In [23]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
from tqdm import tqdm
from textwrap import fill

### **Function Definition**

##### Import Model and Tokenizer

In [24]:
def import_model_tok(model_name_or_path):
    """Return the ``(model, tokenizer)`` pair for a checkpoint, loading it at most once.

    Loaded pairs are kept in the module-level ``MODELS`` dict, keyed by
    ``model_name_or_path``. The classes used for loading are chosen from the
    module-level ``MODEL`` string, not from ``model_name_or_path``.

    Args:
        model_name_or_path: value forwarded to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        The tuple ``(model, tokenizer)`` stored in ``MODELS``.

    Raises:
        ValueError: if ``MODEL`` contains neither ``'bart'`` nor ``'pegasus'``.
    """
    # MODELS is only mutated, never rebound, so no `global` statement is needed.
    if model_name_or_path in MODELS:
        print('[+] model already present in cache\n')
        return MODELS[model_name_or_path]
    print('[*] importing the model\n')

    if 'bart' in MODEL:
        from transformers import BartForConditionalGeneration, BartTokenizer

        model_cls, tokenizer_cls = BartForConditionalGeneration, BartTokenizer
    elif 'pegasus' in MODEL:
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer

        model_cls, tokenizer_cls = PegasusForConditionalGeneration, PegasusTokenizer
    else:
        # Previously this fell through and died with UnboundLocalError on `model`.
        raise ValueError(
            "Unsupported MODEL %r: expected a name containing 'bart' or 'pegasus'" % (MODEL,))

    model = model_cls.from_pretrained(model_name_or_path)
    tokenizer = tokenizer_cls.from_pretrained(model_name_or_path)

    MODELS[model_name_or_path] = model, tokenizer
    print('[+] the model is now present in cache\n')
    return MODELS[model_name_or_path]

##### Nice Print

In [25]:
def print_example(idx, text, summ, bull):
    """Pretty-print one example: its index, source text, prediction and reference.

    The source text is wrapped at 150 columns; the predicted summary and the
    reference bullets are wrapped at 100 columns. A rule of 100 '#' characters
    followed by a blank line closes the example.

    Args:
        idx: identifier of the example, printed as-is.
        text: source text that was summarised.
        summ: generated summary.
        bull: reference bullets.
    """
    print(idx)
    print(fill(text, 150))
    print()

    # Both summary sections share the same layout: heading, wrapped body, blank line.
    for heading, body in (('Prediction:', summ), ('Reference:', bull)):
        print(heading)
        print(fill(body, 100))
        print()

    print('#' * 100)
    print()

##### Print Examples

In [26]:
def print_examples(model_name_or_path, df, n_examples=10):
    """Summarise a random sample of rows and print each result next to its reference.

    The model and tokenizer come from ``import_model_tok`` and the model is moved
    to ``device``. Rows are sampled with ``random_state=config.SEED``. Generation
    settings (length bounds, length penalty, beams, n-gram blocking) are read
    from ``config``.

    Args:
        model_name_or_path: checkpoint passed to ``import_model_tok``.
        df: DataFrame providing a ``text`` and a ``bullets`` value per row.
        n_examples: number of rows to sample; capped at ``len(df)``.
    """
    model, tokenizer = import_model_tok(model_name_or_path)
    model = model.to(device)

    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    n_examples = min(n_examples, len(df))
    df_examples = df.sample(n_examples, axis='index', random_state=config.SEED)

    for idx, row in df_examples.iterrows():
        # truncation=True caps the input at the tokenizer's maximum length, so an
        # over-long text is shortened instead of breaking generation. The text
        # printed below is still the full, untruncated one.
        input_ids = tokenizer.encode(
            row.text, return_tensors='pt', truncation=True).to(device)

        summ_enc = model.generate(
            input_ids,
            min_length=config.ONE_BULLET_MIN_LEN,
            max_length=config.ONE_BULLET_MAX_LEN,
            length_penalty=config.LENGTH_PENALTY,
            num_beams=config.NUM_BEAMS,
            no_repeat_ngram_size=config.NO_REPEAT_NGRAM_SIZE,
            early_stopping=True)[0]  # keep the first returned sequence
        summ = tokenizer.decode(summ_enc, skip_special_tokens=True)

        print_example(idx, row.text, summ, row.bullets)

##### Plot Evaluation

In [27]:
def plot_evaluation(model_name_or_path):
    """Draw grouped box plots of the saved ROUGE scores of one model.

    Reads ``OUTPUT_PATH + model_name_or_path.replace('/', '?') + '.csv'`` and, for
    every ROUGE type in ``config.ROUGE_TYPES`` and every statistic in
    precision/recall/fmeasure, uses the column named
    ``<rouge>_<statistic>_<model_name_or_path>``.

    Layout: one group per ROUGE type on the x axis (tick labels), and inside each
    group one box per statistic, coloured by statistic (legend).

    NOTE(review): ``OUTPUT_PATH`` is not defined in the cells visible alongside
    this function; it must exist as a global before this is called.

    Args:
        model_name_or_path: model identifier used in the CSV file name and in
            the score column names.
    """
    df = pd.read_csv(OUTPUT_PATH+model_name_or_path.replace('/', '?')+'.csv').set_index(['book', 'chapter'])

    prf = ['precision', 'recall', 'fmeasure']
    rouge_types = list(config.ROUGE_TYPES)
    num_rouge = len(rouge_types)

    # One colour per statistic, so that the legend entries match the boxes.
    colors = plt.get_cmap('tab10')(np.arange(len(prf)))

    # Group centres are 2 units apart; each statistic is shifted sideways inside its group.
    xticks = 2*np.arange(1, num_rouge+1)
    offsets = np.linspace(-0.15*len(prf), 0.15*len(prf), len(prf))

    fig, ax = plt.subplots(figsize=(10, 6))
    for stat, offset, colour in zip(prf, offsets, colors):
        # One call per statistic: it draws that statistic's box for every ROUGE type,
        # so the number of boxes always equals the number of tick positions.
        boxes = ax.boxplot(
            [df[rouge+'_'+stat+'_'+model_name_or_path].tolist() for rouge in rouge_types],
            positions=xticks+offset,
            sym='+',
            widths=0.4,
            patch_artist=False,
            meanline=True,
            showmeans=True)
        for artists in boxes.values():
            plt.setp(artists, color=colour)
        # Empty line used only to create the legend entry for this statistic.
        ax.plot([], c=colour, label=stat)
    ax.legend()

    ax.grid(True, axis='y', alpha=0.7, linestyle='--')
    ax.set_title('Evaluation Results', fontsize='xx-large')
    ax.set_ylabel('Rouge', fontsize='x-large')
    # boxplot rewrites the ticks on every call; restore one tick per ROUGE group.
    ax.set_xticks(xticks)
    ax.set_xticklabels(rouge_types, fontsize='x-large')
    plt.show()

## **Assign Bullets Chunk Chapter**

In [29]:
def _load_split(split_name):
    """Read ``<data_dir><split_name>.csv`` and index it by (book, chapter)."""
    frame = pd.read_csv(data_dir + split_name + '.csv')
    return frame.set_index(['book', 'chapter'])


df_train = _load_split('train')
df_val = _load_split('val')
df_test = _load_split('test')

### **Print and Summarization**

##### Print Train Examples

In [30]:
# Summarise a random sample of training rows (n_examples keeps its default of 10) with
# the sshleifer/distilbart-cnn-12-6 checkpoint and print each prediction next to its
# reference bullets. This is the first use of the checkpoint, so it gets downloaded here.
print_examples('sshleifer/distilbart-cnn-12-6', df_train)

[*] importing the model



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1621.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1222317369.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…


[+] the model is now present in cache

(9781912776696, 'hh-5')
Time to-event endpoints . Many endpoints in oncology are time-to-event endpoints, id est the time until a specified event of interest occurs, such as
death or the occurrence of disease . Typical examples are . overall survival (OS) . progression-free survival (PFS) . disease-free survival (DFS) .
duration of response . Often these endpoints are measured from the time the participants in a study are randomized into treatment groups . This is the
case for OS, PFS and DFS, but duration of response is measured from the time point at which partial or complete response is achieved to disease
progression . Time-to-event analyses include information from both censored and uncensored observations (see below) . Censoring is a common feature of
all time-to-event endpoints . A subject's time to event is said to be censored if the event of interest (death, disease progression etc . has not
occurred in that patient by the end of the fol

##### Print Val Examples

In [31]:
# Same qualitative check on the validation split; the checkpoint is served from the
# MODELS cache this time (see the "[+] model already present in cache" output).
print_examples('sshleifer/distilbart-cnn-12-6', df_val)

[+] model already present in cache

(9781908541178, 'ch_6')
As often as possible, acute pain syndromes must be reversed in a timely manner to limit the possibilities of developing a chronic pain condition, and
pain-relieving strategies must be structured and coordinated to implement a prompt return to family, workplace and social activities . Anatomic
structures involved in generating pain . Many anatomic factors can play a primary or a secondary role in the development and progression of low back
pain syndromes . The most common causes of low back pain are mechanical in origin . Mechanical disorders of the lumbar spine are related to injury,
overuse or deformity of a spinal structure . The most important traumatic factors in low back pain relate to soft tissue structures . Precise
identification of the injured tissue, and the role of that injury in the consequent pain and dysfunction, can be frustratingly difficult . Problems
occur in muscles, intervertebral discs, facet joints, ligam

##### Print Test Examples

In [32]:
# Same qualitative check on the test split, again using the cached checkpoint.
print_examples('sshleifer/distilbart-cnn-12-6', df_test)

[+] model already present in cache

(9781908541277, 'ch_8')
Pharmacokinetics and drug-drug interactions . As its half-life is shorter than that of PB, concentrations of PB are usually higher than those of PRM .
Like PB, PRM is a powerful enzyme inducer . Sodium valproate (VPA) . The anticonvulsant property of VPA was recognized serendipitously in 1963 when it
was used by Pierre Eymard as a solvent for a number of other compounds . VPA exerts its antiepileptic property, at least in part, by limiting
sustained repetitive firing by a use- and voltage-dependent effect on sodium channels . It also facilitates the effects of the inhibitory
neurotransmitter GABA . VPA is now established as effective over the complete range of seizure types, with particular value in the idiopathic
generalized epilepsies . The starting dose for adults and adolescents should be 500 mg once or twice daily . Alterations thereafter can be made
according to the clinical status of the patient . Divalproex sodium (a c