# Import & Setup

In [None]:
!pip install datasets transformers rouge-score nltk sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 4.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 54.3 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 78.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 61.7 MB/s 
Collecting mu

In [None]:
import io
import string

import datasets
import nltk
import numpy as np
import pandas as pd
import sentencepiece as spm
import sklearn
import torch
from datasets import load_metric, Dataset, DatasetDict
from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from this mount below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Dataset

In [None]:
#uploaded = files.upload()

In [None]:
#data = pd.read_csv(io.BytesIO(uploaded['arxiv_data_210930-054931.csv']))
#data

In [None]:
# Raw arXiv export stored on the mounted Drive. Adjacent string literals are
# concatenated by Python, so the path is unchanged; it is only split for readability.
raw_csv_path = (
    '/content/drive/MyDrive/Data Science/'
    'UCB Master of Information and Data Science (MIDS)/'
    'MIDS W266 Natural Language Processing with Deep Learning/'
    'summarizing_abstract/data/raw/arxiv_data_210930-054931.csv'
)
data = pd.read_csv(raw_csv_path)
data

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...
...,...,...,...
56176,"['cs.CV', 'cs.IR']",Mining Spatio-temporal Data on Industrializati...,Despite the growing availability of big data i...
56177,"['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']",Wav2Letter: an End-to-End ConvNet-based Speech...,This paper presents a simple end-to-end model ...
56178,['cs.LG'],Deep Reinforcement Learning with Double Q-lear...,The popular Q-learning algorithm is known to o...
56179,"['stat.ML', 'cs.LG', 'math.OC']",Generalized Low Rank Models,Principal components analysis (PCA) is a well-...


In [None]:
# Convert the pandas frame into a Hugging Face `Dataset`.
# NOTE(review): this rebinds `datasets`, shadowing the `datasets` module imported
# in the imports cell; from here on the name refers to the data, not the library.
datasets = Dataset.from_pandas(data)
datasets

Dataset({
    features: ['terms', 'titles', 'abstracts'],
    num_rows: 56181
})

# Train/Validation/Test Split

In [None]:
# Hold out 10% of all rows for validation. The fixed seed makes the split
# reproducible across kernel restarts (without it every run draws a new split).
train_dataset, validation_dataset = datasets.train_test_split(test_size=0.1, seed=42).values()

In [None]:
# Carve the test set out of the remaining training rows (10% of what is left
# after the validation hold-out). Seeded so the split is reproducible.
train_dataset, test_dataset = train_dataset.train_test_split(test_size=0.1, seed=42).values()

In [None]:
# Bundle the three splits into one DatasetDict keyed by split name.
# `DatasetDict` comes from the imports cell, so the library no longer has to be
# re-imported here just to reach it through the shadowed `datasets` name.
# NOTE(review): `datasets` is rebound once more (it held the full Dataset before),
# so the split cells above need the Dataset re-created before they can be re-run.
datasets = DatasetDict({"train": train_dataset, "test": test_dataset, "validation": validation_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 45505
    })
    test: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 5057
    })
    validation: Dataset({
        features: ['terms', 'titles', 'abstracts'],
        num_rows: 5619
    })
})

In [None]:
# Subsample every split to keep the proof-of-concept fast.
# - shuffle(seed=...) makes the chosen subset reproducible.
# - min(...) keeps select() from raising when a split has fewer rows than requested.
for split_name, n_rows in (("train", 20000), ("validation", 2000), ("test", 2000)):
    shuffled_split = datasets[split_name].shuffle(seed=42)
    datasets[split_name] = shuffled_split.select(range(min(n_rows, len(shuffled_split))))

# Load Model from G-Drive

In [None]:
# Maximum number of source tokens passed to the encoder; longer inputs are truncated.
max_input_length = 512

# Fine-tuned checkpoint saved on Google Drive (path is relative to the working directory).
model_name = "pegasus-saata-baseline"
model_dir = "drive/MyDrive/Models/" + model_name

# Tokenizer and weights are restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Transformer Pointer Generator (TPG) Model Proof-of-concept

To address this potential issue, we propose a novel variant of the original pointer-generator network architecture, the *TPG (transformer pointer-generator)* model, which enables the model to copy words from the source input text via *pointing*. We expect this to improve the performance of our baseline fine-tuned seq2seq models on text summarization tasks.

The proposed hybrid pointer-generator network architecture combines the vocabulary distributions (prediction output) and the multi-head cross-attention distributions generated by pre-trained text summarization seq2seq transformer models (e.g., BART, PEGASUS, T5) to predict the generation probability $p_{gen} \in [0,1]$ for each decoder timestep. The probability $p_{gen}$ for timestep $t$ is calculated from the context vector $h^*_t$, the decoder state $s_t$ and the decoder input $x_t$:

$p_{gen}=\sigma(w_h^T h^*_t + w_s^T s_t + w_x^T x_t + b_{ptr})$

where the vectors $w_h$, $w_s$, $w_x$ and the scalar $b_{ptr}$ are learnable parameters and $\sigma$ is the sigmoid function.

<div>
<img src="attachment:TPG_transformer_pointer_generator_network_revised.jpg" width="500"/>
</div>

We can extract context vector $h^*_t$, the decoder state $s_t$ and the decoder input $x_t$ from the pretrained fine-tuned PEGASUS model:

In [None]:
# Dump the module tree to inspect the encoder/decoder layers and their dimensions.
print(model)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): Lay

Given a test example, we can extract the transformer hidden states and encoder-decoder cross-attention by the `PegasusForConditionalGeneration.forward()` method.

In [None]:
# Run one decoder step on a single test abstract and keep the hidden states and
# attention weights that feed the pointer-generator computation below.

# Index the row first: `datasets['test']['abstracts'][0]` materialises the whole column.
abstract = datasets['test'][0]['abstracts']
inputs = ["summarize: " + abstract]
inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")

# Seed the decoder with the model's configured start token. Tokenizing "" instead
# yields only the special tokens the tokenizer appends (EOS for PEGASUS), which is
# not the token the decoder is started with during training and generation.
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

# Call the module rather than .forward() so that any registered hooks run.
outputs = model(input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                decoder_input_ids=decoder_input_ids,
                output_hidden_states=True,
                output_attentions=True)

In [None]:
# Show the raw abstract selected above (before the "summarize: " prefix and truncation).
print(abstract)

In the past decade the mathematical theory of machine learning has lagged far
behind the triumphs of deep neural networks on practical challenges. However,
the gap between theory and practice is gradually starting to close. In this
paper I will attempt to assemble some pieces of the remarkable and still
incomplete mathematical mosaic emerging from the efforts to understand the
foundations of deep learning. The two key themes will be interpolation, and its
sibling, over-parameterization. Interpolation corresponds to fitting data, even
noisy data, exactly. Over-parameterization enables interpolation and provides
flexibility to select a right interpolating model.
  As we will see, just as a physical prism separates colors mixed within a ray
of light, the figurative prism of interpolation helps to disentangle
generalization and optimization properties within the complex picture of modern
Machine Learning. This article is written with belief and hope that clearer
understanding 

In [None]:
# `decoder_hidden_states` is a tuple of per-layer decoder activations, not a
# vocabulary distribution, so the label is corrected to match what is printed.
print("Decoder Hidden States:")
print(outputs.decoder_hidden_states)

Vocabulary Distribution:
(tensor([[[-1.8576e+01, -7.7758e-03,  5.4886e+01,  ..., -3.9867e+00,
           1.2295e+01, -2.6963e+00]]], grad_fn=<AddBackward0>), tensor([[[-10.0656, -12.6732,  34.3188,  ...,  -6.6054,   7.9227,  -7.4615]]],
       grad_fn=<AddBackward0>), tensor([[[10.0759,  4.1235, 53.9034,  ...,  6.6956,  1.8556,  3.8106]]],
       grad_fn=<AddBackward0>), tensor([[[  0.2483,   5.4233,  55.6269,  ..., -16.7475, -23.3910,  32.6636]]],
       grad_fn=<AddBackward0>), tensor([[[ 56.6540, -41.3813,  72.5320,  ...,   6.7152, -32.4193,  58.9642]]],
       grad_fn=<AddBackward0>), tensor([[[101.8255,   8.6658, 117.4852,  ...,  14.5300,   1.6791,  58.7417]]],
       grad_fn=<AddBackward0>), tensor([[[112.8141, -25.6616, 111.3830,  ...,  51.0570,   6.7364,  91.6880]]],
       grad_fn=<AddBackward0>), tensor([[[115.7590, -34.1688, 114.0858,  ...,  47.8927,  10.8204, 149.2177]]],
       grad_fn=<AddBackward0>), tensor([[[ 91.7853, -61.0717, 101.2149,  ...,  52.6648, -39.4530, 116.6

In [None]:
# Encoder-decoder cross-attention weights, returned as a tuple of tensors
# (presumably one per decoder layer — see the Transformers output docs).
print("Attention Distribution:")
print(outputs.cross_attentions)

Attention Distribution:
(tensor([[[[3.7598e-04, 6.3400e-03, 1.4692e-04,  ..., 9.7645e-05,
           2.9446e-04, 1.9355e-03]],

         [[2.3703e-06, 5.5529e-04, 4.5900e-05,  ..., 7.6131e-06,
           2.3455e-03, 5.1132e-03]],

         [[1.4488e-03, 2.8840e-03, 1.7712e-03,  ..., 4.4406e-04,
           9.1826e-04, 5.7548e-03]],

         ...,

         [[3.5904e-03, 5.6379e-03, 9.0207e-04,  ..., 2.4047e-04,
           4.2404e-04, 4.8209e-04]],

         [[3.0275e-03, 1.0120e-02, 3.3006e-03,  ..., 1.0809e-03,
           6.3440e-03, 3.4218e-02]],

         [[9.7725e-04, 7.2858e-03, 7.7255e-04,  ..., 2.1925e-05,
           8.1245e-04, 7.6313e-03]]]], grad_fn=<ViewBackward0>), tensor([[[[1.2403e-06, 7.4732e-04, 2.7157e-05,  ..., 5.9382e-04,
           3.7734e-04, 1.7455e-03]],

         [[1.3258e-03, 2.1206e-03, 6.6782e-04,  ..., 1.7513e-03,
           8.2910e-03, 1.9901e-02]],

         [[2.0986e-04, 6.2689e-03, 2.0640e-04,  ..., 3.1721e-04,
           1.4198e-02, 4.7888e-03]],

      

In [None]:
# NOTE: `logits` are raw, unnormalised scores over the vocabulary (values can be
# negative or > 1); a softmax is needed to turn them into a probability distribution.
print("Vocabulary Distribution:")
print(outputs.logits)

Vocabulary Distribution:
tensor([[[-0.4234, 14.2356,  0.5334,  ..., -7.1022, -4.2433, -9.4791]]],
       grad_fn=<AddBackward0>)


The context vector $h^*_t$ is the product of the multi-head encoder-decoder cross-attention (the sum of 16 heads, each a $1\times189$ vector) and the encoder's last hidden state (a $189\times1024$ matrix):

In [None]:
# Shape of the last cross-attention tensor; the recorded run shows [1, 16, 1, 189].
print(outputs.cross_attentions[-1].shape)

torch.Size([1, 16, 1, 189])


In [None]:
# Final encoder output, one hidden vector per source token.
print(outputs.encoder_last_hidden_state)

tensor([[[-0.2002, -0.1047, -0.0696,  ...,  0.1101, -0.0118,  0.0643],
         [-0.1280, -0.0091, -0.1026,  ..., -0.1198,  0.0205,  0.1092],
         [ 0.0200, -0.0108, -0.1908,  ..., -0.2198, -0.0190,  0.0378],
         ...,
         [-0.0116,  0.2254, -0.2567,  ...,  0.0488, -0.1272, -0.0536],
         [ 0.1791,  0.0673, -0.2450,  ...,  0.2363, -0.1964,  0.0312],
         [ 0.2136, -0.1332, -0.0578,  ..., -0.0558, -0.1657, -0.1278]]],
       grad_fn=<NativeLayerNormBackward0>)


In [None]:
# Shape of the final encoder output; the recorded run shows [1, 189, 1024].
print(outputs.encoder_last_hidden_state.shape)

torch.Size([1, 189, 1024])


The decoder state $s_t$ is a  $1\times1024$ vector:

In [None]:
# Last entry of `decoder_hidden_states`, used in this notebook as the decoder state s_t.
print(outputs.decoder_hidden_states[-1])

tensor([[[-0.1768, -0.0050,  0.2316,  ..., -0.1031, -0.0421, -0.0165]]],
       grad_fn=<NativeLayerNormBackward0>)


In [None]:
# Shape of the decoder state s_t; the recorded run shows [1, 1, 1024].
print(outputs.decoder_hidden_states[-1].shape)

torch.Size([1, 1, 1024])


The decoder input $x_t$ is a $1\times1024$ vector, and would be updated for each decoder timestep $t$:

In [None]:
# First entry of `decoder_hidden_states`, used in this notebook as the decoder input x_t
# (presumably the decoder embedding output — confirm against the Transformers docs).
print(outputs.decoder_hidden_states[0])

tensor([[[-1.8576e+01, -7.7758e-03,  5.4886e+01,  ..., -3.9867e+00,
           1.2295e+01, -2.6963e+00]]], grad_fn=<AddBackward0>)


In [None]:
# Shape of the decoder input x_t; the recorded run shows [1, 1, 1024].
print(outputs.decoder_hidden_states[0].shape)

torch.Size([1, 1, 1024])


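$p_{gen}$ can be calculated by applying a sigmoid function to the linear combination $p_{gen}=\sigma(w_h^T h^*_t + w_s^T s_t + w_x^T x_t + b_{ptr})$, resulting in a scalar in $[0,1]$ for each decoder timestep, which is then used to weight the $1\times96{,}103$ vocabulary distribution: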

In [None]:
# Proof-of-concept computation of p_gen for the single decoder timestep above:
#   p_gen = sigmoid(w_h^T h*_t + w_s^T s_t + w_x^T x_t + b_ptr)
hidden_size = outputs.encoder_last_hidden_state.size(-1)

# Collapse the attention heads of the last cross-attention tensor:
# (batch, heads, tgt_len, src_len) -> (batch, tgt_len, src_len).
# NOTE(review): summing follows the notebook text; the summed weights total
# `heads` rather than 1 — use .mean(dim=1) if a normalised distribution is needed.
attn_dist = outputs.cross_attentions[-1].sum(dim=1)

# Context vector h*_t: attention-weighted sum of encoder states,
# (batch, tgt_len, src_len) @ (batch, src_len, hidden) -> (batch, tgt_len, hidden).
context_h = torch.bmm(attn_dist, outputs.encoder_last_hidden_state)
state_s = outputs.decoder_hidden_states[-1]   # decoder state s_t
input_x = outputs.decoder_hidden_states[0]    # decoder input x_t

# One Linear layer over the concatenation [h*_t; s_t; x_t] is equivalent to the
# three separate dot products plus the scalar bias b_ptr. The weights are randomly
# initialised here; they would be learned during training.
lin = torch.nn.Linear(3 * hidden_size, 1)
p_gen = torch.sigmoid(lin(torch.cat([context_h, state_s, input_x], dim=-1)))  # (batch, tgt_len, 1)

The PEGASUS transformer outputs vocabulary logits as a vector of size $1\times96{,}103$ (one score per vocabulary entry):

In [None]:
# Shape of the vocabulary logits; the recorded run shows [1, 1, 96103].
print(outputs.logits.shape)

torch.Size([1, 1, 96103])


The final distribution can be calculated given the encoder input token ids, $p_{gen}$, the vocabulary distribution (obtained from the logits output of the pretrained transformer model), and the attention distribution over the source tokens:

In [None]:
def _calc_final_dist(self, x, p_gens, vocab_dists, attn_dists):
    """Calculate the final distribution, for the pointer-generator model
    Args:
      x: encoder input which contain oov number
      p_gens: the generation probability, choose vocab from article or vocab
      vocab_dists: The vocabulary distributions
      attn_dists: The attention distributions
    Returns:
      final_dists: The final distributions
    """
    with tf.variable_scope('final_distribution', reuse=tf.AUTO_REUSE):
        # Multiply vocab dists by p_gen and attention dists by (1-p_gen)
        vocab_dists = p_gens * vocab_dists
        attn_dists = (1-p_gens) * attn_dists
        batch_size = tf.shape(attn_dists)[0]
        dec_t = tf.shape(attn_dists)[1]
        attn_len = tf.shape(attn_dists)[2]
        dec = tf.range(0, limit=dec_t) # [dec]
        dec = tf.expand_dims(dec, axis=-1) # [dec, 1]
        dec = tf.tile(dec, [1, attn_len]) # [dec, atten_len]
        dec = tf.expand_dims(dec, axis=0) # [1, dec, atten_len]
        dec = tf.tile(dec, [batch_size, 1, 1]) # [batch_size, dec, atten_len]
        x = tf.expand_dims(x, axis=1) # [batch_size, 1, atten_len]
        x = tf.tile(x, [1, dec_t, 1]) # [batch_size, dec, atten_len]
        x = tf.stack([dec, x], axis=3)
        attn_dists_projected = tf.map_fn(fn=lambda y: tf.scatter_nd(y[0], y[1], [dec_t, self.hp.vocab_size]),
                                         elems=(x, attn_dists), dtype=tf.float32)
        final_dists = attn_dists_projected + vocab_dists
    return final_dists