In [26]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import struct
import hashlib
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tokenization
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans
import pickle
from sknetwork.ranking import PageRank
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import torch
import sys

In [27]:
def tensor_to_string(x):
    """Return the UTF-8 decoded text held by ``x``.

    ``x`` must expose a ``numpy()`` method whose result has a ``decode``
    method (e.g. ``bytes``).
    """
    raw_bytes = x.numpy()
    return raw_bytes.decode('UTF-8')

In [28]:
def get_sent_list(text,stem=None):
    """Split ``text`` into sentences, optionally stemming every word.

    Args:
        text: Document text to split with ``sent_tokenize``.
        stem: ``None`` (the default) or the string ``"None"`` to return the
            sentences unchanged; ``"EnglishStemmer"`` to word-tokenize each
            sentence, stem every token with ``EnglishStemmer`` and re-join
            the stems with single spaces.

    Returns:
        A list with one string per sentence.

    Raises:
        ValueError: If ``stem`` is not one of the supported values.
    """
    sents = sent_tokenize(text)
    # The default is the None object while callers may pass the string "None";
    # both mean "no stemming". Previously only the string was recognised, so
    # the default call crashed on an unbound ``stemmer``.
    if stem is None or stem == "None":
        return sents
    if stem != "EnglishStemmer":
        raise ValueError("Unsupported stemmer: " + repr(stem))

    stemmer = EnglishStemmer()
    return [
        " ".join(stemmer.stem(w) for w in word_tokenize(sent))
        for sent in sents
    ]

In [29]:
## Sentence-embedding models used by the summarizer.
## Each model exposes an encode function -> takes a list of sentences and
## returns a list of embeddings (all of the same dimension).
transformers = ["paraphrase-albert-small-v2"]

# Maps model name -> loaded SentenceTransformer instance.
models = dict()
# Inference is pinned to the CPU; the commented line below selects a GPU when one is available.
device = torch.device("cpu")
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE: the loop variable `trans` stays defined at module level and is read by later cells.
for trans in transformers:
    models[trans] = SentenceTransformer(trans,cache_folder='/mnt/disks/disk-1/data/models')
    # NOTE(review): `_target_device` is a private attribute (leading underscore);
    # confirm it is still honoured by the installed sentence-transformers version.
    models[trans]._target_device = device

In [30]:
def generate_summary(V, k):
    """Pick ``k`` mutually distant sentences with the greedy k-center heuristic.

    Sentence 0 is always the first center. Every following center is the
    not-yet-chosen sentence whose Euclidean distance to its nearest chosen
    center is largest; ties go to the lowest index.

    Args:
        V: Sequence of embeddings, one per sentence, all of the same shape.
            Multi-dimensional embeddings are flattened, which gives the same
            distance as ``np.linalg.norm`` of their difference.
        k: Target number of sentences in the summary.

    Returns:
        A list of sentence indices in selection order. Every index is
        returned when ``k >= len(V)``; an empty list when ``k <= 0``.
    """
    n = len(V)
    if k >= n:
        return list(range(n))
    if k <= 0:
        # Previously k == 0 selected every sentence and then raised
        # ValueError on max() of an empty dict.
        return []

    points = np.asarray(V).reshape(n, -1)

    centers = [0]
    # nearest[i] is the distance from sentence i to its closest chosen center.
    # It is updated incrementally instead of being recomputed against every
    # center on each round. Chosen sentences are pinned to -inf so that
    # argmax can never pick them again.
    nearest = np.linalg.norm(points - points[0], axis=1)
    nearest[0] = -np.inf

    while len(centers) < k:
        new_center = int(np.argmax(nearest))
        centers.append(new_center)
        dist_to_new = np.linalg.norm(points - points[new_center], axis=1)
        np.minimum(nearest, dist_to_new, out=nearest)
        nearest[new_center] = -np.inf
    return centers

In [None]:
## Pagerank version

def sim(a, b):
    """Cosine similarity of vectors ``a`` and ``b``.

    Computed as ``a.b / sqrt((a.a) * (b.b))``; no special handling is done
    for zero-length vectors.
    """
    numerator = np.dot(a, b)
    squared_norm_product = np.dot(a, a) * np.dot(b, b)
    return numerator / np.sqrt(squared_norm_product)

def generate_summary(V,k):
    """Pick the ``k`` highest-ranked sentences by PageRank over cosine similarity.

    Builds a dense, symmetric similarity matrix with ``sim`` (diagonal
    included), scores every sentence with ``PageRank().fit_transform`` and
    keeps the ``k`` best-scoring indices.

    Note: executing this cell rebinds ``generate_summary`` and therefore
    replaces the greedy k-center definition given earlier in the notebook.

    Args:
        V: Sequence of sentence embeddings (anything supporting ``len`` and
            integer indexing, e.g. a 2-D array or a list of vectors).
        k: Target number of sentences in the summary.

    Returns:
        Sentence indices in ascending order. A plain list when ``k >= len(V)``
        (all indices) or ``k <= 0`` (empty); otherwise a sorted NumPy array.
    """
    n = len(V)
    if k >= n:
        return list(range(n))
    if k <= 0:
        # With k == 0 the slice ``[-0:]`` below is the whole array, so every
        # sentence used to be returned instead of none.
        return []

    adj = np.zeros((n, n))
    for i in range(n):
        adj[i][i] = sim(V[i],V[i])
        # The matrix is symmetric, so each pair is computed once and mirrored.
        for j in range(i+1,n):
            s = sim(V[i], V[j])
            adj[i][j] = s
            adj[j][i] = s

    # NOTE(review): cosine similarities can be negative; confirm that PageRank
    # accepts negative edge weights or clip them before fitting.
    pr = PageRank()
    scores = pr.fit_transform(adj)
    # argpartition places the k largest scores in the last k positions.
    ind = np.argpartition(scores, -k)[-k:]
    return np.sort(ind)

In [31]:
def uml_summary(x,index,kind="cnn_dailymail",model="all-MiniLM-L6-v2"):
    """Build an extractive summary for one dataset example and score it with ROUGE.

    The article and the reference summary are split into stemmed sentences,
    the article sentences are embedded with ``models[model]``, and
    ``generate_summary`` selects as many sentences as the reference has.

    Relies on the module-level ``models`` dict and ``scorer`` object.

    Args:
        x: Dataset example; indexed by the article and summary field names,
            each value being passed to ``tensor_to_string``.
        index: Position of the example in the evaluation run. Currently
            unused; it was only consumed by the disabled embedding cache.
        kind: Dataset name: ``"cnn_dailymail"``, ``"scientific_papers/arxiv"``
            or ``"scientific_papers/pubmed"``.
        model: Key into ``models`` naming the sentence encoder to use.

    Returns:
        Tuple ``(rouge1, rouge2, rougeL)`` of F-measures.

    Raises:
        ValueError: If ``kind`` is not a supported dataset name.
        KeyError: If ``model`` has not been loaded into ``models``.
    """
    if kind == "cnn_dailymail":
        key1 = 'article'
        key2 = 'highlights'
    elif kind == "scientific_papers/arxiv" or kind == "scientific_papers/pubmed":
        key1 = 'article'
        key2 = 'abstract'
    else:
        # Previously an unknown kind left key1/key2 unbound.
        raise ValueError("Unsupported dataset kind: " + repr(kind))

    # Fail before the tokenization work when the encoder was never loaded.
    if model not in models:
        raise KeyError(model)

    stemmer = "EnglishStemmer"
    text = get_sent_list(tensor_to_string(x[key1]),stemmer)
    summary = get_sent_list(tensor_to_string(x[key2]),stemmer)

    text_emb = models[model].encode(text)

    # Use a dedicated loop name so the parameter ``x`` is not shadowed.
    gen_sum = [text[sent_idx] for sent_idx in generate_summary(text_emb,len(summary))]
    scores = scorer.score(" ".join(summary)," ".join(gen_sum))
    return scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure

In [24]:
%%time

# datasets = ["cnn_dailymail","scientific_papers/arxiv","scientific_papers/pubmed"]
datasets = ["cnn_dailymail"]
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)
for ds in datasets:
    # The splits depend only on the dataset, so load them once per dataset
    # instead of once per (dataset, model) pair.
    train, val, test = tfds.load(name=ds,
                          split=["train", "validation", "test"],
                          data_dir="/mnt/disks/disk-1/data")
    for trans in transformers:
        r1 = []
        r2 = []
        rl = []
        index = 0
        # Iterate the dataset lazily: list(test) materialized every example of
        # the split even though the loop stops after the first 101.
        for x in test:
            r1_val,r2_val,rl_val = uml_summary(x,index,kind=ds,model=trans)
            index += 1
            r1.append(r1_val)
            r2.append(r2_val)
            rl.append(rl_val)
            print(index)
            # index is incremented before this check, so 101 examples are scored.
            if index > 100:
                print(index)
                break
        print(ds,trans)
        print(index)
        print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
        print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
        print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))
        print("___")

INFO:absl:Load dataset info from /mnt/disks/disk-1/data/scientific_papers/arxiv/1.1.1
INFO:absl:Reusing dataset scientific_papers (/mnt/disks/disk-1/data/scientific_papers/arxiv/1.1.1)
INFO:absl:Constructing tf.data.Dataset scientific_papers for split ['train', 'validation', 'test'], from /mnt/disks/disk-1/data/scientific_papers/arxiv/1.1.1


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
101
scientific_papers/arxiv paraphrase-albert-small-v2
101
Rouge 1 :  19.56
Rouge 2 :  4.37
Rouge L :  12.77
___
CPU times: user 27min 9s, sys: 1min 44s, total: 28min 54s
Wall time: 5min 1s


In [25]:
# Re-run the extractive pipeline on the example currently held in the global
# `x` so the chosen sentences and the reference can be inspected below.
kind = "scientific_papers/arxiv"
model = "paraphrase-albert-small-v2"

# Field names holding the document and its reference summary.
if kind in ("scientific_papers/arxiv", "scientific_papers/pubmed"):
    key1, key2 = 'article', 'abstract'
elif kind == "cnn_dailymail":
    key1, key2 = 'article', 'highlights'

stemmer = "EnglishStemmer"
text = get_sent_list(tensor_to_string(x[key1]), stemmer)
summary = get_sent_list(tensor_to_string(x[key2]), stemmer)

text_emb = models[model].encode(text)

# Select as many article sentences as the reference summary contains.
gen_sum = [text[sent_idx] for sent_idx in generate_summary(text_emb, len(summary))]
scores = scorer.score(" ".join(summary), " ".join(gen_sum))
print(*(scores[metric].fmeasure for metric in ("rouge1", "rouge2", "rougeL")))

0.3402489626556016 0.058577405857740586 0.1908713692946058


In [26]:
# Display the article sentences selected for the generated summary.
gen_sum

['linear wave function in quantum chemistri are fundament limit by their inabl to compact express wave function in strong correl regim , a difficulti that aris direct from the factori growth of hilbert space in the quantum mani - bodi problem .',
 'what if the plane from one bond intersect a far - away atom ?',
 'the nodal surfac of @ xmath38 is a plane center at @ xmath39 and normal to the unit vector @ xmath40 , while the nodal surfac of @ xmath41 is an ellipsoid with center @ xmath39 and axe defin by the eigenvector and eigenvalu of @ xmath42 .']

In [27]:
# Display the stemmed reference summary sentences for comparison.
summary

['we demonstr that 4-bodi real space jastrow factor are , with the right type of jastrow basi function , capabl of perform success wave function stencil to remov unwant ionic term from an overabund fermion refer without unduli modifi the remain compon .',
 'in addit to great improv size consist ( restor it exact in the case of a gemin power ) , real - space wave function stencil is , unlik it hilbert space predecessor , immedi compat with diffus mont carlo , allow it to be use in the pursuit of compact , strong correl trial function with reliabl nodal surfac .',
 'we demonstr the efficaci of this approach in the context of a doubl bond dissoci by use it to extract a qualit correct nodal surfac despit be pair with a restrict slater determin , that , due to ionic term error , produc a ground state with a qualit incorrect nodal surfac when use in the absenc of the jastrow .']

In [15]:
# Re-run the extractive pipeline on the example currently held in the global
# `x` so the chosen sentences and the reference can be inspected below.
kind = "scientific_papers/arxiv"
model = "paraphrase-albert-small-v2"

# Field names holding the document and its reference summary.
if kind in ("scientific_papers/arxiv", "scientific_papers/pubmed"):
    key1, key2 = 'article', 'abstract'
elif kind == "cnn_dailymail":
    key1, key2 = 'article', 'highlights'

stemmer = "EnglishStemmer"
text = get_sent_list(tensor_to_string(x[key1]), stemmer)
summary = get_sent_list(tensor_to_string(x[key2]), stemmer)

text_emb = models[model].encode(text)

# Select as many article sentences as the reference summary contains.
gen_sum = [text[sent_idx] for sent_idx in generate_summary(text_emb, len(summary))]
scores = scorer.score(" ".join(summary), " ".join(gen_sum))
print(*(scores[metric].fmeasure for metric in ("rouge1", "rouge2", "rougeL")))

0.21212121212121213 0.03125 0.1212121212121212


In [16]:
# Display the article sentences selected for the generated summary.
gen_sum

['a mother-of-two lost more than ten stone after she becam concern that her size was caus her to look like a man .',
 'snack : crisp .',
 'dinner : chicken stir-fri .']

In [17]:
# Display the stemmed reference summary sentences for comparison.
summary

['kim callaghan , from ireland , pile on the pound after have children .',
 'limit to size 28 cloth kim , 39 , worri she resembl a man .',
 'she join slim world and drop ten dress size as well as 10st .']

In [2]:
import os
# Point the Hugging Face cache at the shared data disk (the same directory that
# is passed as cache_dir to the from_pretrained calls below).
# NOTE(review): presumably this must be set before `transformers` is first
# imported to have any effect — confirm, and move it to the top cell if so.
os.environ['TRANSFORMERS_CACHE'] = '/mnt/disks/disk-1/data/models'

In [7]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# Load the 'facebook/bart-large-cnn' checkpoint and its tokenizer from the shared cache.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn',cache_dir='/mnt/disks/disk-1/data/models')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn',cache_dir='/mnt/disks/disk-1/data/models')

ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
# BartForConditionalGeneration is the PyTorch model class, so the tokenizer
# must return PyTorch tensors; 'tf' tensors cannot be fed to model.generate.
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True,return_tensors='pt')

# Generate Summary
# summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
# print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])


In [14]:
from transformers import TFBartModel,FlaxBartForConditionalGeneration,BartTokenizer
# Load the Flax 'facebook/bart-large' model and its tokenizer from the shared cache.
# NOTE: this rebinds the globals `model` and `tokenizer` set by the previous cell.
# The recorded output warns that 'final_logits_bias' is newly initialized.
model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large',cache_dir='/mnt/disks/disk-1/data/models')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large',cache_dir='/mnt/disks/disk-1/data/models')

Downloading:   0%|          | 0.00/775M [00:00<?, ?B/s]

INFO:absl:Unable to initialize backend 'tpu_driver': NOT_FOUND: Unable to find driver in registry given worker: 
INFO:absl:Unable to initialize backend 'gpu': NOT_FOUND: Could not find registered platform with name: "cuda". Available platform names are: Interpreter Host
INFO:absl:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.
Some weights of FlaxBartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: {('final_logits_bias',)}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Smoke test: encode a two-sentence batch with the BART encoder.
text = ["My friends are cool but they eat too many carbs.","My friends are cool but they eat too many carbs."]
# padding=True lets sentences of different lengths share one batch tensor and
# truncation=True makes max_length effective; without them this only worked
# because both sentences are identical and short.
inputs = tokenizer(text, max_length=1024, truncation=True, padding=True, return_tensors='jax')
encoder_outputs = model.encode(**inputs)

In [36]:
# Split the batched encoder output into one entry per input sentence and show
# the shape of the first one (recorded output: (13, 1024)).
# NOTE: this overwrites the global `x` that earlier cells use for the last dataset example.
x = list(encoder_outputs.last_hidden_state)
x[0].shape

(13, 1024)

In [41]:
# Load the Flax 'facebook/bart-large' model and tokenizer that uml_summary2 reads as globals.
# NOTE: these are the same two calls as in the earlier BART loading cell.
model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large',cache_dir='/mnt/disks/disk-1/data/models')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large',cache_dir='/mnt/disks/disk-1/data/models')

def uml_summary2(x,index,kind="cnn_dailymail"):
    """Build an extractive summary from BART encoder states and score it with ROUGE.

    Same pipeline as ``uml_summary``, except that the article sentences are
    represented by the encoder output of the module-level ``model`` after
    tokenization with the module-level ``tokenizer``. Also relies on the
    module-level ``scorer``.

    Args:
        x: Dataset example; indexed by the article and summary field names,
            each value being passed to ``tensor_to_string``.
        index: Position of the example in the evaluation run (currently unused).
        kind: Dataset name: ``"cnn_dailymail"``, ``"scientific_papers/arxiv"``
            or ``"scientific_papers/pubmed"``.

    Returns:
        Tuple ``(rouge1, rouge2, rougeL)`` of F-measures.

    Raises:
        ValueError: If ``kind`` is not a supported dataset name.
    """
    if kind == "cnn_dailymail":
        key1 = 'article'
        key2 = 'highlights'
    elif kind == "scientific_papers/arxiv" or kind == "scientific_papers/pubmed":
        key1 = 'article'
        key2 = 'abstract'
    else:
        # Previously an unknown kind left key1/key2 unbound.
        raise ValueError("Unsupported dataset kind: " + repr(kind))

    stemmer = "EnglishStemmer"
    text = get_sent_list(tensor_to_string(x[key1]),stemmer)
    summary = get_sent_list(tensor_to_string(x[key2]),stemmer)

    # truncation=True is required for max_length to take effect.
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors='jax',padding=True)
    # Run the encoder once; it used to be called twice per article with the
    # first result thrown away.
    # NOTE(review): each entry is the full per-token state matrix of a padded
    # sentence, pad positions included — confirm whether pooling to a single
    # vector per sentence was intended.
    text_emb = list(model.encode(**inputs).last_hidden_state)

    # Use a dedicated loop name so the parameter ``x`` is not shadowed.
    gen_sum = [text[sent_idx] for sent_idx in generate_summary(text_emb,len(summary))]
    scores = scorer.score(" ".join(summary)," ".join(gen_sum))
    return scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure

Some weights of FlaxBartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: {('final_logits_bias',)}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
%%time

# datasets = ["cnn_dailymail","scientific_papers/arxiv","scientific_papers/pubmed"]
datasets = ["cnn_dailymail"]
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)
for ds in datasets:
    train, val, test = tfds.load(name=ds,
                          split=["train", "validation", "test"],
                          data_dir="/mnt/disks/disk-1/data")

    # uml_summary2 always embeds with the global BART `model`, so there is
    # nothing to iterate over in `transformers` for this evaluation.
    r1 = []
    r2 = []
    rl = []
    index = 0
    # Iterate the dataset lazily: list(test) materialized every example of
    # the split even though the loop stops after the first 101.
    for x in test:
        r1_val,r2_val,rl_val = uml_summary2(x,index,kind=ds)
        index += 1
        r1.append(r1_val)
        r2.append(r2_val)
        rl.append(rl_val)
        print(index)
        # index is incremented before this check, so 101 examples are scored.
        if index > 100:
            print(index)
            break
    # Label the scores with the encoder loaded for uml_summary2; printing
    # `trans` attributed them to a sentence-transformer that was never used.
    print(ds,"facebook/bart-large")
    print(index)
    print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
    print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
    print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))
    print("___")

INFO:absl:Load dataset info from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0
INFO:absl:Reusing dataset cnn_dailymail (/mnt/disks/disk-1/data/cnn_dailymail/3.1.0)
INFO:absl:Constructing tf.data.Dataset cnn_dailymail for split ['train', 'validation', 'test'], from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
101
cnn_dailymail paraphrase-albert-small-v2
101
Rouge 1 :  30.06
Rouge 2 :  10.52
Rouge L :  17.89
___
CPU times: user 49min 16s, sys: 16.3 s, total: 49min 33s
Wall time: 15min 2s


In [46]:
%%time

# datasets = ["cnn_dailymail","scientific_papers/arxiv","scientific_papers/pubmed"]
datasets = ["cnn_dailymail"]
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)
for ds in datasets:
    # The splits depend only on the dataset, so load them once per dataset
    # instead of once per (dataset, model) pair.
    train, val, test = tfds.load(name=ds,
                          split=["train", "validation", "test"],
                          data_dir="/mnt/disks/disk-1/data")
    for trans in transformers:
        r1 = []
        r2 = []
        rl = []
        index = 0
        # Iterate the dataset lazily: list(test) materialized every example of
        # the split even though the loop stops after the first 101.
        for x in test:
            r1_val,r2_val,rl_val = uml_summary(x,index,kind=ds,model=trans)
            index += 1
            r1.append(r1_val)
            r2.append(r2_val)
            rl.append(rl_val)
            print(index)
            # index is incremented before this check, so 101 examples are scored.
            if index > 100:
                print(index)
                break
        print(ds,trans)
        print(index)
        print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
        print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
        print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))
        print("___")

INFO:absl:Load dataset info from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0
INFO:absl:Reusing dataset cnn_dailymail (/mnt/disks/disk-1/data/cnn_dailymail/3.1.0)
INFO:absl:Constructing tf.data.Dataset cnn_dailymail for split ['train', 'validation', 'test'], from /mnt/disks/disk-1/data/cnn_dailymail/3.1.0


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
101
cnn_dailymail paraphrase-albert-small-v2
101
Rouge 1 :  32.21
Rouge 2 :  10.72
Rouge L :  20.08
___
CPU times: user 3min 27s, sys: 1.31 s, total: 3min 28s
Wall time: 36.9 s


In [47]:
# Count the examples in the test split by materializing it into a list
# (recorded result: 11490).
len(list(test))

11490

In [48]:
# NOTE(review): this call fails as written — the recorded output is a TypeError
# because fit() requires a `train_objectives` argument. Supply it or drop the cell.
models[trans].fit()

TypeError: fit() missing 1 required positional argument: 'train_objectives'