# Import & Setup

In [16]:
!pip install datasets transformers rouge-score nltk sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
import pandas as pd
import io
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from google.colab import files
from google.colab import drive
from datasets import load_metric, Dataset
import datasets
import nltk
nltk.download('punkt')
import string
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import sentencepiece as spm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Dataset

In [19]:
#uploaded = files.upload()

In [20]:
#data = pd.read_csv(io.BytesIO(uploaded['arxiv_data_210930-054931.csv']))
#data

In [21]:
data = pd.read_csv('/content/drive/MyDrive/Data Science/UCB Master of Information and Data Science (MIDS)/MIDS W266 Natural Language Processing with Deep Learning/summarizing_abstract/data/raw/arxiv_data_210930-054931.csv')
data

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...
...,...,...,...
56176,"['cs.CV', 'cs.IR']",Mining Spatio-temporal Data on Industrializati...,Despite the growing availability of big data i...
56177,"['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']",Wav2Letter: an End-to-End ConvNet-based Speech...,This paper presents a simple end-to-end model ...
56178,['cs.LG'],Deep Reinforcement Learning with Double Q-lear...,The popular Q-learning algorithm is known to o...
56179,"['stat.ML', 'cs.LG', 'math.OC']",Generalized Low Rank Models,Principal components analysis (PCA) is a well-...


In [22]:
datasets = Dataset.from_pandas(data)
datasets

Dataset({
    features: ['terms', 'titles', 'abstracts'],
    num_rows: 56181
})

# Train/Validation/Test Split

In [23]:
train_dataset, validation_dataset = datasets.train_test_split(test_size=0.1).values()

In [24]:
train_dataset, test_dataset = train_dataset.train_test_split(test_size=0.1).values()

In [25]:
import datasets
datasets = datasets.DatasetDict({"train":train_dataset,"test":test_dataset, "validation":validation_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 45505
    })
    test: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 5057
    })
    validation: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 5619
    })
})

In [26]:
datasets["train"] = datasets["train"].shuffle().select(range(20000))
datasets["validation"] = datasets["validation"].shuffle().select(range(2000))
datasets["test"] = datasets["test"].shuffle().select(range(2000))

# Tokenization

In [27]:
prefix = "summarize: "

max_input_length = 512
max_target_length = 64

def preprocess_data(examples):
  inputs = [prefix + text for text in examples["abstracts"]]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["titles"], max_length=max_target_length, 
                       truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs
     

In [29]:
datasets_cleaned = datasets.filter(lambda example: (len(example['abstracts']) >= 500) and (len(example['titles']) >= 20))

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [30]:
model_checkpoint = 'google/pegasus-large'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [31]:
tokenized_datasets = datasets_cleaned.map(preprocess_data, batched=True)
tokenized_datasets

  0%|          | 0/20 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['terms', 'titles', 'abstracts', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 19794
    })
    test: Dataset({
        features: ['terms', 'titles', 'abstracts', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1987
    })
    validation: Dataset({
        features: ['terms', 'titles', 'abstracts', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1977
    })
})

# Load Model from G-Drive

In [32]:
model_name = "pegasus-saata-baseline"
model_dir = f"drive/MyDrive/Models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

# Replace Most Frequent Technical Term with new Token

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
# extract the vocabulary from the tokenizer
vocab = tokenizer.get_vocab()

In [35]:
# testing vocabulary
print(f"Is 'BERT' part of the tokenizer vocabulary? {'BERT' in vocab.keys()}")
print(f"Is 'neurons' part of the tokenizer vocabulary? {'medulla' in vocab.keys()}")
# medulla oblongata: The lower part of the brain stem, responsible for life-regulating functions like breathing and heart rate
print(f"Is 'medulla' part of the tokenizer vocabulary? {'medulla' in vocab.keys()}")

Is 'BERT' part of the tokenizer vocabulary? True
Is 'neurons' part of the tokenizer vocabulary? False
Is 'medulla' part of the tokenizer vocabulary? False


In [36]:
from statistics import mode

In [37]:
# identify new vocabulary given an input corpus and the tokenizer vocabulary
def new_vocab_id(text, vocab):
  results = []
  stopwords   = set(nltk.corpus.stopwords.words('english'))
  punctuation = string.punctuation
  for token in nltk.word_tokenize(text):
    if token not in stopwords and token not in punctuation:
      results.append(token)
  results = [x for x in results if x not in vocab]
  return max(set(results), key = results.count)

Example on an abstract that contains neuro-science terms that are not part of the pretrained model vocabulary:

In [38]:
abstract_example = "Mapping the connectivity of neurons in the brain (i.e., connectomics) is a challenging problem due to both the number of connections in even the smallest organisms and the nanometer resolution required to resolve them. Because of this, previous connectomes contain only hundreds of neurons, such as in the C.elegans connectome. Recent technological advances will unlock the mysteries of increasingly large connectomes (or partial connectomes). However, the value of these maps is limited by our ability to reason with this data and understand any underlying motifs. To aid connectome analysis, we introduce algorithms to cluster similarly-shaped neurons, where 3D neuronal shapes are represented as skeletons. In particular, we propose a novel location-sensitive clustering algorithm. We show clustering results on neurons reconstructed from the Drosophila medulla that show high-accuracy."
new_vocab = new_vocab_id(abstract_example, vocab)
print(new_vocab)

neurons


In [39]:
abstract_example = abstract_example.replace(new_vocab, '<unk>')
print(abstract_example)

Mapping the connectivity of <unk> in the brain (i.e., connectomics) is a challenging problem due to both the number of connections in even the smallest organisms and the nanometer resolution required to resolve them. Because of this, previous connectomes contain only hundreds of <unk>, such as in the C.elegans connectome. Recent technological advances will unlock the mysteries of increasingly large connectomes (or partial connectomes). However, the value of these maps is limited by our ability to reason with this data and understand any underlying motifs. To aid connectome analysis, we introduce algorithms to cluster similarly-shaped <unk>, where 3D neuronal shapes are represented as skeletons. In particular, we propose a novel location-sensitive clustering algorithm. We show clustering results on <unk> reconstructed from the Drosophila medulla that show high-accuracy.


# Evaluation on the `[unk]` Token Replacement Pre-processing

Applying the token `[unk]` replacement of new vocabulary in abstract test examples to generate summarized titles:

In [42]:
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [43]:
i = 100
results_1 = []
all_predictions_1 = []
all_actual_1 = []
while i > 0:
  abstract = datasets['test']['abstracts'][i]
  # extract the vocabulary from the tokenizer
  vocab = tokenizer.get_vocab()
  # find the new vocab given the abstract
  new_vocab = new_vocab_id(abstract, vocab)
  # replace
  abstract = abstract.replace(new_vocab, '<unk>')
  inputs = ["summarize: " + abstract]
  inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
  output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
  all_predictions_1.append(output)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
  predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
  predicted_title = predicted_title.replace('<unk>', new_vocab)
  actual_title = datasets['test']['titles'][i]
  all_actual_1.append(tokenizer(actual_title, max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"])
  results_1.append([actual_title, predicted_title])
  i -= 1

results_1 = pd.DataFrame(results_1)
results_1.columns = ['Actual Title', 'Predicted Title']
results_1

Unnamed: 0,Actual Title,Predicted Title
0,Subjective Annotation for a Frame Interpolatio...,Visual Quality Assessment for Optical Flow and...
1,Scientific Discovery by Generating Counterfact...,A Framework for Model Explanation and Scientif...
2,Supervised Classification Methods for Flash X-...,Eigen-Image and Log-Likelihood Classifiers for...
3,A Neural Markovian Multiresolution Image Label...,A Formal Evaluation of the MCV (Markov Concurr...
4,StyleCLIP: Text-Driven Manipulation of StyleGA...,Text-Guided Latent Spaces for StyleGAN Image M...
...,...,...
95,Neural Embedding for Physical Manipulations,Learning the Full Topology of Action and State...
96,Learning to Discover Novel Visual Categories v...,Deep Embedded Clustering: A Transfer Learning ...
97,An Accurate and Real-time Self-blast Glass Ins...,A Low Signal-Noise-ratio Image Location Framew...
98,Professor Forcing: A New Algorithm for Trainin...,Professor: A Regularizer for Multi-Step Sampling


In [44]:
# flatten predictions
all_predictions_1_flattened = [pred for preds in all_predictions_1 for pred in preds]

# compute metrics
predictions_labels_1 = [all_predictions_1_flattened, all_actual_1]
compute_metrics(predictions_labels_1)

{'rouge1': 44.9488,
 'rouge2': 26.4669,
 'rougeL': 39.9096,
 'rougeLsum': 39.977,
 'gen_len': 17.21}

Compare against the original pegasus model summarized titles:

In [45]:
i = 100
results_2 = []
all_predictions_2 = []
all_actual_2 = []
while i > 0:
  abstract = datasets['test']['abstracts'][i]
  inputs = ["summarize: " + abstract]
  inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
  output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
  all_predictions_2.append(output)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
  predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
  actual_title = datasets['test']['titles'][i]
  all_actual_2.append(tokenizer(actual_title, max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"])
  results_2.append([actual_title, predicted_title])
  i -= 1

results_2 = pd.DataFrame (results_2)
results_2.columns = ['Actual Title', 'Predicted Title']
results_2

Unnamed: 0,Actual Title,Predicted Title
0,Subjective Annotation for a Frame Interpolatio...,A Crowdsourced Survey on Visual Quality Assess...
1,Scientific Discovery by Generating Counterfact...,A Framework for Model Explanation and Scientif...
2,Supervised Classification Methods for Flash X-...,Eigen-Image and Log-Likelihood Classifiers for...
3,A Neural Markovian Multiresolution Image Label...,MCV: Markov Concurrent Vision for Image Labeling
4,StyleCLIP: Text-Driven Manipulation of StyleGA...,Leveraging Contrastive Language-Image Pre-trai...
...,...,...
95,Neural Embedding for Physical Manipulations,Learning the Full Topology of Action and State...
96,Learning to Discover Novel Visual Categories v...,Deep Embedded Clustering: A Transfer Learning ...
97,An Accurate and Real-time Self-blast Glass Ins...,A Low Signal-Noise-ratio Image Location Framew...
98,Professor Forcing: A New Algorithm for Trainin...,Professor Forcing: An Adversarial Domain Adapt...


In [46]:
# flatten predictions
all_predictions_2_flattened = [pred for preds in all_predictions_2 for pred in preds]

# compute metrics
predictions_labels_2 = [all_predictions_2_flattened, all_actual_2]
compute_metrics(predictions_labels_2)

{'rouge1': 48.6194,
 'rouge2': 30.1191,
 'rougeL': 42.3751,
 'rougeLsum': 42.4474,
 'gen_len': 17.4}