#### For Colab

In [1]:
# Not Python logic: this cell only holds a JavaScript snippet inside a bare
# string, so running the cell just echoes the text back as its output.
# The snippet defines ClickConnect(), which logs "Working" and clicks the
# element matched by "colab-toolbar-button", and schedules it every
# 900000 ms (15 minutes) -- presumably meant to be pasted into the browser
# console to keep the Colab session alive; TODO confirm.
# NOTE(review): clearInterval(i) cancels the timer created on the previous
# line, so if the whole snippet is pasted at once nothing is ever clicked;
# presumably that last line is meant to be run separately to stop the timer.
"""
function ClickConnect(){
    console.log("Working");
    document.querySelector("colab-toolbar-button").click() 
}
var i = setInterval(ClickConnect, 900000)
clearInterval(i)
"""

'\nfunction ClickConnect(){\n    console.log("Working");\n    document.querySelector("colab-toolbar-button").click() \n}\nvar i = setInterval(ClickConnect, 900000)\nclearInterval(i)\n'

In [2]:
# Colab only: mount the user's Google Drive under /content/drive so that the
# Drive paths used in this notebook can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root folder of the project on the mounted Drive. Other paths in this
# notebook are built by plain string concatenation onto this value, so the
# trailing slash must be kept.
drive_dir = '/content/drive/My Drive/MAGMA: Summarization/'

#### Install Libraries

In [4]:
!pip install -U transformers
!pip install -U gensim
!pip install -U datasets
!pip install rouge-score

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.1MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 52.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=03c65ed24b9118bc4f6

### **Config**

In [10]:
import sys
# Put the project folder first on the module search path so that the
# following `import config` looks there before anywhere else
# (assumes config.py lives directly in drive_dir -- TODO confirm).
sys.path.insert(0, drive_dir)
import config

In [18]:
# Regular expression used to cut each text into units: do we split into
# sentences or paragraphs?
#   sentences  -> r'\.|\n' (the value below is a variant that does not split
#                 on a period followed by a digit, so "3.5" stays in one piece)
#   paragraphs -> r'\n'
# Written as a raw string: '\.' and '\d' are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
RE_SPLITTER = r'\.(?!\d)|\n'

TOKEN_MAX_LEN = 99              # max length of a word, in characters
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

# Folder that receives the TextRank results (drive_dir followed by a fixed
# relative path; the trailing slash is kept because file names are appended
# to it by string concatenation).
OUTPUT_PATH = f'{drive_dir}summarization/textrank_output/'

### **Init**

In [15]:
import os
import re

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Load the tokenizer that matches the model family named in config.MODEL.
# The import is done inside each branch so only the needed class is loaded.
if 'pegasus' in config.MODEL:
    from transformers import PegasusTokenizer
    tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in config.MODEL:
    from transformers import BartTokenizer
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
else:
    # Without this branch `tokenizer` would silently stay undefined and the
    # notebook would only fail later with a NameError where it is first used.
    raise ValueError(
        "Unsupported config.MODEL %r: expected a name containing "
        "'pegasus' or 'bart'" % (config.MODEL,))

### **Karger Books Base Dataset**

In [20]:
# Read the base dataset and index it by book / chapter / section / subsection.
base_dataset = f'{drive_dir}datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset).set_index(
    ['book', 'chapter', 'section', 'subsection'])

# The `bullets` cells are passed through eval(); missing values are skipped
# (presumably each cell is the string form of a Python list -- TODO confirm).
# NOTE(review): eval() executes arbitrary code found in the CSV. This is only
# acceptable while the file is fully trusted; ast.literal_eval would be the
# safe alternative if the cells are plain literals.
df['bullets'] = df['bullets'].map(eval, na_action='ignore')

## **Topic modeling**

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Remove long words (> TOKEN_MAX_LEN chars)
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [21]:
def _split_text(text):
    """Cut ``text`` on RE_SPLITTER; drop empty pieces and strip the others."""
    pieces = []
    for piece in re.split(RE_SPLITTER, text):
        if piece != '':
            pieces.append(piece.strip())
    return pieces

# 1. replace every text by its list of sentences / paragraphs (NaN is skipped)
# 2. explode, so that each sentence / paragraph gets its own row
# 3. the column now holds one unit per row: call it `para`
# 4. drop every row that contains a missing value
df = (
    df.assign(text=df.text.map(_split_text, na_action='ignore'))
    .explode('text')
    .rename(columns={'text': 'para'})
    .dropna()
)

# Remove unwanted chars at beginning or end of sentence:
# leading digits, punctuation, spaces and newlines ...
df.para = df.para.map(lambda p: p.lstrip('0123456789.,;: \n'))
# ... and trailing punctuation, spaces and newlines (trailing digits are kept)
df.para = df.para.map(lambda p: p.rstrip('.,;: \n'))

# Remove multiple spaces: every run of whitespace becomes a single space.
# The pattern is a raw string because '\s' is an invalid escape sequence in a
# normal string literal and triggers a warning on recent Python versions.
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Tokenize every paragraph; words longer than TOKEN_MAX_LEN chars are dropped
def para2words(para):
    """Tokenize ``para`` with ``gensim.utils.simple_preprocess``.

    The call uses ``deacc=True`` and ``max_len=TOKEN_MAX_LEN``; the value
    returned by ``simple_preprocess`` is passed through unchanged
    (presumably a list of word tokens -- confirm against the gensim docs).
    """
    words = gensim.utils.simple_preprocess(
        para, max_len=TOKEN_MAX_LEN, deacc=True)
    return words

df['para_proc'] = df['para'].map(para2words)

# Discard sentences / paragraphs with fewer than PARA_MIN_LENGTH tokens:
# their token list is blanked out with NaN, then every row containing a
# missing value is dropped.
is_too_short = df['para_proc'].map(len) < PARA_MIN_LENGTH
df.loc[is_too_short, 'para_proc'] = np.nan

df = df.dropna()

### **TextRank summarization**

In [None]:
def _concat_paras(paras):
    """Concatenate the paragraphs, appending ' . ' after each of them."""
    return ''.join(para + ' . ' for para in paras)


def _first_value(values):
    """Return the first element of ``values``."""
    return list(values)[0]


# Collapse the rows to one per group of the first two index levels, keeping
# the original group order: the paragraphs are glued back into one text and
# the first `bullets` value of the group is kept.
df = (
    df.groupby(level=[0, 1], sort=False)
    .agg({'para': _concat_paras, 'bullets': _first_value})
    .rename(columns={'para': 'text'})
)

In [None]:
from gensim.summarization.summarizer import summarize

# Word budget for each summary: 80% of the midpoint between the configured
# maximum and minimum bullet lengths. It does not depend on the chapter, so
# it is computed once.
textrank_word_count = int(
    0.8*(config.BULLETS_MAX_LEN+config.BULLETS_MIN_LEN)/2)

df['textrank_summary'] = ''

# Distinct (level-0, level-1) index pairs, i.e. one entry per row group
books = df.index.get_level_values(0)
chapters = df.index.get_level_values(1)
for book, ch in set(zip(books, chapters)):
    # TextRank: with split=True the selected sentences come back as a list
    textrank_redu = summarize(
        df.loc[(book, ch), 'text'],
        word_count=textrank_word_count,
        split=True)
    df.loc[(book, ch), 'textrank_summary'] = ' '.join(textrank_redu)

#### Number of tokens in the summary

In [None]:
# Descriptive statistics of the summary lengths, measured as the number of
# ids returned by tokenizer.encode for each summary
summary_token_counts = df['textrank_summary'].map(
    lambda summary: len(tokenizer.encode(summary)))
print(summary_token_counts.describe())
print()

### **Evaluation**

* Calculate ROUGE scores for reductions and summaries
* Save the results in CSV files

In [None]:
from datasets import load_metric

# ROUGE metric object; its compute() method is used to score the summaries
metric = load_metric(path="rouge")

#### Evaluating summaries

In [None]:
def _rouge_for_row(row):
    """Score one row: its TextRank summary against its joined bullets.

    The bullets of the row are joined with spaces into a single reference
    string. Columns are read by label rather than by position, so the result
    does not depend on the column order and avoids the deprecated positional
    ``row[0]`` / ``row[1]`` lookup on a label-indexed Series.
    """
    # NOTE(review): the keyword is spelled `use_agregator` as in the original
    # cell; keep it in sync with the parameter name of the installed
    # `datasets` ROUGE metric -- TODO confirm.
    return metric.compute(
        predictions=[row['textrank_summary']],
        references=[' '.join(row['bullets']).strip()],
        rouge_types=config.ROUGE_TYPES,
        use_agregator=False)


rouge_res = df[['bullets', 'textrank_summary']].apply(_rouge_for_row, axis=1)

# One column per (rouge type, measure). The rouge types come from the config
# module, as in the compute() call above; the bare name ROUGE_TYPES is not
# defined in this notebook and raised a NameError.
for r in config.ROUGE_TYPES:
    for i, prf in enumerate(['precision', 'recall', 'fmeasure']):
        # score[r] holds one entry per prediction; a single prediction is
        # passed per row, so entry 0 is taken and field i selected from it
        df[r+'_'+prf] =\
            rouge_res.map(lambda score: score[r][0][i])

#### Saving results

In [None]:
# Create the output folder first: to_csv does not create missing directories,
# and failing here would lose the results computed above.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_csv(OUTPUT_PATH+'df_textrank_summ.csv')