In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import struct
import hashlib
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tokenization
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans
import pickle
from sknetwork.ranking import PageRank
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
def tensor_to_string(x):
    return x.numpy().decode('UTF-8')

In [3]:
def get_sent_list(text,stem=None):
    sents = sent_tokenize(text)
    if stem == "None":
        return sents
    ans = []
    for sent in sents:
        words = word_tokenize(sent)
        word_stem = [stem.stem(w) for w in words]
        ans.append(" ".join(word_stem))
    return ans

In [4]:
stemmer = EnglishStemmer()

In [5]:
## Model features include an encode function -> takes a list of sentences. Returns a list of embeddings (all same dim)
# transformers = ["all-mpnet-base-v2",
#                 "multi-qa-mpnet-base-dot-v1",
#                 "all-distilroberta-v1",
#                 "all-MiniLM-L12-v2",
#                 "multi-qa-distilbert-cos-v1",
#                 "all-MiniLM-L6-v2",
#                 "multi-qa-MiniLM-L6-cos-v1",
#                 "paraphrase-multilingual-mpnet-base-v2",
#                 "paraphrase-albert-small-v2",
#                 "paraphrase-multilingual-MiniLM-L12-v2",
#                 "paraphrase-MiniLM-L3-v2",
#                 "distiluse-base-multilingual-cased-v1",
#                 "distiluse-base-multilingual-cased-v2"]

transformers = ["all-MiniLM-L6-v2"]

In [6]:
## V = list of embeddings. k = target size of summary
## Returns a list of sentence indices

def generate_summary(V, k):
    if k >= len(V):
        return list(range(len(V)))
    k -= 1
    centers = []
    cities = list(range(len(V)))
    centers.append(0)
    cities.remove(0)
    while k!= 0:
        city_dict = {}
        for cty in cities:
            min_dist = float("inf")
            for c in centers:
                min_dist = min(min_dist,np.linalg.norm(V[cty] - V[c]))
            city_dict[cty] = min_dist
        new_center = max(city_dict, key = lambda i: city_dict[i])
        centers.append(new_center)
        cities.remove(new_center)
        k -= 1
    return centers

In [None]:
## Pagerank version
def sim(a, b):
    return np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))

def generate_summary(V,k):
    if k >= len(V):
        return list(range(len(V)))
    n = V.shape[0]
    adj = np.zeros((n, n))
    for i in range(n):
        adj[i][i] = sim(V[i],V[i])
        for j in range(i+1,n):
            s = sim(V[i], V[j])
            adj[i][j] = s
            adj[j][i] = s

    pr = PageRank()
    scores = pr.fit_transform(adj)
    ind = np.argpartition(scores, -k)[-k:]
    return np.sort(ind)

In [7]:
def uml_summary(x,index,kind="cnn_dailymail",model=None):
    if kind == "cnn_dailymail":
        key1 = 'article'
        key2 = 'highlights'
    elif kind == "scientific_papers/arxiv" or kind == "scientific_papers/pubmed":
        key1 = 'article'
        key2 = 'abstract'
    text = tensor_to_string(x[key1])
    text = get_sent_list(text,stemmer)
    summary = tensor_to_string(x[key2])
    summary = get_sent_list(summary,stemmer)
    text_emb = model.encode(text)
    filename = str(index) + "_" + model + ".pickle"
    folderpath = os.path.join("/mnt/disks/disk-1/data/pickle",kind)
    if not os.path.exists(folderpath):
        os.mkdir(folderpath)
    filepath = os.path.join("/mnt/disks/disk-1/data/pickle",kind,filename)
    with open(filepath, 'wb') as handle:
        pickle.dump(text_emb, handle, protocol=pickle.HIGHEST_PROTOCOL)
    gen_sum = [text[x] for x in generate_summary(text_emb,len(summary))]
    scores = scorer.score(" ".join(summary)," ".join(gen_sum))
    return scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure

In [8]:
datasets = ["cnn_dailymail","scientific_papers/arxiv","scientific_papers/pubmed"]

for ds in datasets:
    for trans in transformers:
        train, val, test = tfds.load(name=ds, 
                              split=["train", "validation", "test"], 
                              data_dir="/mnt/disks/disk-1/data")
        
        model = SentenceTransformer(trans,cache_folder='/mnt/disks/disk-1/data/models')
        r1 = []
        r2 = []
        rl = []
        index = 0
        dataset = test.take(1)
        for x in dataset:
            r1_val,r2_val,rl_val = uml_summary(x,index,kind=ds,model=trans)
            index += 1
            r1.append(r1_val)
            r2.append(r2_val)
            rl.append(rl_val)
            print(index)
            if index > 100:
                break
        print(ds,trans)
        print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
        print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
        print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))
        print("___")

INFO:absl:Load dataset info from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0
INFO:absl:Reusing dataset cnn_dailymail (/mnt/disks/disk-1/data/cnn_dailymail/3.1.0)
INFO:absl:Constructing tf.data.Dataset cnn_dailymail for split ['train', 'validation', 'test'], from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0
2021-11-28 22:05:31.301316: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-28 22:05:31.794169: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38464 MB memory:  -> device: 0, name: A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0
2021-11-28 22:05:34.673186: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185]

TypeError: encode() argument 1 must be str, not list

In [9]:
text = tensor_to_string(x['article'])
text = get_sent_list(text,stemmer)
summary = tensor_to_string(x['highlights'])
summary = get_sent_list(summary,stemmer)
print(text)

['ever notic how plane seat appear to be get smaller and smaller ?', 'with increas number of peopl take to the sky , some expert are question if have such pack out plane is put passeng at risk .', "they say that the shrink space on aeroplan is not onli uncomfort - it 's put our health and safeti in danger .", 'more than squabbl over the arm rest , shrink space on plane put our health and safeti in danger ?', "this week , a u. consum advisori group set up by the depart of transport said at a public hear that while the govern is happi to set standard for anim fli on plane , it doe n't stipul a minimum amount of space for human .", "in a world where anim have more right to space and food than human , ' said charli leocha , consum repres on the committe .", "it is time that the dot and faa take a stand for human treatment of passeng . '", 'but could crowd on plane lead to more serious issu than fight for space in the overhead locker , crash elbow and seat back kick ?', 'test conduct by the

In [10]:
model.encode(text)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.