In [30]:
import tensorflow_datasets as tfds
import os
import numpy as np
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm.notebook import tqdm
from sknetwork.ranking import PageRank
from tqdm.notebook import tqdm
from itertools import combinations
from multiprocess import Pool
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
def word_tokenizer(text):
    return word_tokenize(text)

In [3]:
def get_sent_list(sents,stem=None):
    stemmer = EnglishStemmer()
    ans = []
    for sent in sents:
        words = word_tokenize(sent)
        word_stem = [stemmer.stem(w) for w in words]
        ans.append(" ".join(word_stem))
    return ans

In [4]:
def tensor_to_string(x):
    return x.numpy().decode('UTF-8')

In [43]:
def create_similarity_matrix_tr(sents):
    n = len(sents)
    M = np.zeros([n,n])
    
    A = {i: set(sent.split()) for i,sent in enumerate(sents)}

    for i in range(n):
        for j in range(i,n):
            if (len(sents[i]) != 1) and (len(sents[j]) != 1) :
                m = len(A[i].intersection(A[j])) / (np.log(len(sents[i])+ np.log(len(sents[j]))))
                
                M[i,j] = M[j,i] = m
    return M

def create_similarity_matrix_ps(sents,lambda_1=0.7,lambda_2=0.3):
    n = len(sents)
    M = np.zeros([n,n])
    
    A = {i: set(sent.split()) for i,sent in enumerate(sents)}
        
    for i in range(n):
        for j in range(i,n):
            
            if (len(sents[i]) != 1) and (len(sents[j]) != 1) :
                m = len(A[i].intersection(A[j])) / (np.log(len(sents[i])+ np.log(len(sents[j]))))
                if i == j:
                    continue
                
                M[i,j] = lambda_2 * m
                M[j,i] = lambda_1 * m
    return M

In [38]:
# Test set cnn-daily mail/arxiv/pubmed
def lead(text,k):
    return " ".join([text[i] for i in range(k)])

def text_rank(text,k):
    sents = get_sent_list(text,stem='EnglishStemmer')
    M = create_similarity_matrix_tr(sents)
    pr = PageRank()
    scores = pr.fit_transform(M)
    ind = np.argpartition(scores, -k)[-k:]
    return " ".join([text[i] for i in ind])

def pacsum(text,k):
    sents = get_sent_list(text,stem='EnglishStemmer')
    M = create_similarity_matrix_ps(sents)
    pr = PageRank()
    scores = pr.fit_transform(M)
    ind = np.argpartition(scores, -k)[-k:]
    return " ".join([text[i] for i in ind])

def _oracle_helper(inp):
    text,summ_sent = inp
    result = []
    for i in range(len(text)):
        scores = scorer.score(summ_sent,text[i])
        
    return 

def oracle(text,summary):
    best_combo = []  
    s = set()
    
    for sum_sent in summary:
        
        best_score = 0
        best_idx = 0
        
        for i in range(len(text)):
            if text[i] not in s:
                scores = scorer.score(sum_sent,text[i])
                if scores["rouge1"].fmeasure > best_score:
                    best_score = scores["rouge1"].fmeasure 
                    best_idx = i
        s.add(text[i])
        best_combo.append(best_idx)
          
    return " ".join([text[i] for i in best_combo])
        

In [39]:
def uml_summary(inp):
    kind,algo,x = inp
    if kind == "cnn_dailymail":
        key1 = 'article'
        key2 = 'highlights'
    elif kind == "scientific_papers/arxiv" or kind == "scientific_papers/pubmed":
        key1 = 'article'
        key2 = 'abstract'
        
    text = tensor_to_string(x[key1])
    summary = tensor_to_string(x[key2])
    
    summ_sents = sent_tokenize(summary)
    sents = sent_tokenize(text)
    
    if len(summ_sents) <= len(sents):
        k = len(summ_sents)
        
        if algo == 'lead':
            gen_sum = lead(sents,3 if len(sents) >= 3 else len(sents))
        elif algo == 'pacsum':
            gen_sum = pacsum(sents,k)
        elif algo == 'textrank':
            gen_sum = text_rank(sents,k)
        else:
            gen_sum = oracle(sents,summ_sents)
        scores = scorer.score(summary,gen_sum)
        return scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure
    else:
        return None

In [45]:
datasets = ["cnn_dailymail","scientific_papers/arxiv","scientific_papers/pubmed"]
# datasets = ["scientific_papers/arxiv","scientific_papers/pubmed"]
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)
algos = ['lead','textrank','pacsum','oracle']
# algos = ['pacsum']
for algo in algos:
    print('Algo:',algo)
    for ds in datasets:
        train, val, test = tfds.load(name=ds, 
                          split=["train", "validation", "test"], 
                          data_dir="/mnt/disks/disk-1/data")

        dataset = list(test)
        args = zip([ds] * len(dataset),[algo] * len(dataset),dataset)

        with Pool(11) as pool:
              r = list(tqdm(pool.imap(uml_summary,args), total=len(dataset)))

        r = [a for a in r if a]
        r1, r2, rl = list(zip(*r))

        print('Dataset:',ds)
        print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
        print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
        print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))
        print("___")

Algo: lead


  0%|          | 0/11490 [00:00<?, ?it/s]

Dataset: cnn_dailymail
Rouge 1 :  40.06
Rouge 2 :  17.48
Rouge L :  25.02
___


  0%|          | 0/6440 [00:00<?, ?it/s]

Dataset: scientific_papers/arxiv
Rouge 1 :  27.41
Rouge 2 :  6.54
Rouge L :  15.99
___


  0%|          | 0/6658 [00:00<?, ?it/s]

Dataset: scientific_papers/pubmed
Rouge 1 :  27.86
Rouge 2 :  9.12
Rouge L :  17.14
___
Algo: textrank


  0%|          | 0/11490 [00:00<?, ?it/s]

Dataset: cnn_dailymail
Rouge 1 :  30.03
Rouge 2 :  10.66
Rouge L :  17.81
___


  0%|          | 0/6440 [00:00<?, ?it/s]

Dataset: scientific_papers/arxiv
Rouge 1 :  34.14
Rouge 2 :  10.13
Rouge L :  17.04
___


  0%|          | 0/6658 [00:00<?, ?it/s]

Dataset: scientific_papers/pubmed
Rouge 1 :  38.61
Rouge 2 :  14.21
Rouge L :  19.37
___
Algo: pacsum


  0%|          | 0/11490 [00:00<?, ?it/s]

Dataset: cnn_dailymail
Rouge 1 :  36.66
Rouge 2 :  15.89
Rouge L :  21.86
___


  0%|          | 0/6440 [00:00<?, ?it/s]

Dataset: scientific_papers/arxiv
Rouge 1 :  39.44
Rouge 2 :  12.26
Rouge L :  19.22
___


  0%|          | 0/6658 [00:00<?, ?it/s]

Dataset: scientific_papers/pubmed
Rouge 1 :  41.0
Rouge 2 :  15.61
Rouge L :  19.98
___
Algo: oracle


  0%|          | 0/11490 [00:00<?, ?it/s]

Dataset: cnn_dailymail
Rouge 1 :  52.37
Rouge 2 :  29.21
Rouge L :  42.28
___


  0%|          | 0/6440 [00:00<?, ?it/s]

Dataset: scientific_papers/arxiv
Rouge 1 :  58.19
Rouge 2 :  28.01
Rouge L :  39.38
___


  0%|          | 0/6658 [00:00<?, ?it/s]

Dataset: scientific_papers/pubmed
Rouge 1 :  59.13
Rouge 2 :  31.72
Rouge L :  41.75
___
