In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import struct
import hashlib
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tokenization
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans

In [2]:
## Generate list of training/validation and test files

def hashhex(s):
    """Return the SHA-1 digest of the bytes-like object `s` as a hex string."""
    return hashlib.sha1(s).hexdigest()

def get_url_hashes(url_list):
    """Hash every entry of `url_list` with `hashhex`.

    Returns a new list holding one hex digest per input entry, in the same
    order as the input.
    """
    digests = []
    for url in url_list:
        digests.append(hashhex(url))
    return digests

def read_text_file(text_file):
    """Read `text_file` and return its lines as a list of UTF-8 encoded bytes.

    Each line has leading/trailing whitespace (including the newline) removed
    before it is encoded.

    Parameters
    ----------
    text_file : path to the text file to read.

    Returns
    -------
    list of bytes, one entry per line of the file, in file order.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding:
    # the lines are re-encoded as UTF-8 below, so a different decoding would
    # silently change the resulting bytes (and any hash computed from them).
    with open(text_file, "r", encoding="utf-8") as f:
        return [line.strip().encode('utf-8') for line in f]

# Locations of the URL lists for the three dataset splits.
url_file_train = 'CNN-DM/all_train.txt'
url_file_val = 'CNN-DM/all_val.txt'
url_file_test = 'CNN-DM/all_test.txt'

# Training split: read the URLs, hash them, and report how many there are.
url_list_train = read_text_file(url_file_train)
url_hashes_train = get_url_hashes(url_list_train)
print(len(url_hashes_train))

# Validation split.
url_list_val = read_text_file(url_file_val)
url_hashes_val = get_url_hashes(url_list_val)
print(len(url_hashes_val))

# Test split.
url_list_test = read_text_file(url_file_test)
url_hashes_test = get_url_hashes(url_list_test)
print(len(url_hashes_test))

287227
13368
11490


In [3]:
## Line by line preprocessing (no tokenization)
def preprocess(x):
    """Return `x` with leading and trailing whitespace removed (no tokenization)."""
    stripped = x.strip()
    return stripped

In [4]:
## Clean text, and split summary
def clean(x):
    """Split the lines of a story into article sentences and summary sentences.

    Lines before the first line containing '@highlight' are treated as article
    text; every non-marker line after it is treated as part of the summary.
    Each kept line is passed through `preprocess`.

    Parameters
    ----------
    x : iterable of str, the lines of one story.

    Returns
    -------
    (text, summary) : two lists of preprocessed lines.
    """
    text = []
    summary = []
    in_summary = False
    for line in x:
        # Skip every blank line, not only an exact '\n': whitespace-only lines
        # (or '' from inputs split without newlines) would otherwise end up as
        # empty "sentences" in the output.
        if not line.strip():
            continue
        if '@highlight' in line:
            # Marker line: everything that follows belongs to the summary.
            in_summary = True
            continue
        if in_summary:
            summary.append(preprocess(line))
        else:
            text.append(preprocess(line))
    return text,summary

In [5]:
## Sentence-embedding model, loaded by name ('all-MiniLM-L6-v2').
## It is used through `model.encode`, which takes a list of sentences and
## returns a list of embeddings, one per sentence (all of the same dimension).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
## ROUGE scorer configured for the rouge1, rouge2 and rougeL metrics, with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)

In [7]:
## V = list of sentence embeddings; k = target number of sentences in the summary.
## Returns a list of sentence indices (positions in V).

def generate_summary(V, k):
    """Pick `k` sentence indices with a greedy farthest-point selection.

    The first sentence (index 0) is always the first pick. Each following pick
    is the remaining sentence whose Euclidean distance to its nearest
    already-picked sentence is largest; ties go to the lowest index.

    Parameters
    ----------
    V : sequence of embedding vectors (indexable, supports `len` and
        element-wise subtraction, e.g. a 2-D numpy array).
    k : target number of sentences to pick.

    Returns
    -------
    list of int : the picked indices in selection order. The list has
    `min(k, len(V))` entries, and is empty when `k <= 0` or `V` is empty.
    """
    n = len(V)
    # Never ask for more sentences than exist; this also covers an empty V.
    k = min(k, n)
    if k <= 0:
        return []

    centers = [0]
    # Running distance from each unpicked sentence to its nearest center.
    # Keeping it up to date means each round only measures distances to the
    # newest center instead of re-measuring against every center.
    nearest = dict.fromkeys(range(1, n), float("inf"))
    newest = 0
    while len(centers) < k:
        for cty in nearest:
            nearest[cty] = min(nearest[cty], np.linalg.norm(V[cty] - V[newest]))
        # Farthest remaining sentence becomes the next center.
        newest = max(nearest, key=nearest.get)
        centers.append(newest)
        del nearest[newest]
    return centers

In [12]:
## Return R1, R2 and RL score for a text (using the generate_summary function)

def uml_summary(l):
    """Score an extractive summary of one story against its reference summary.

    The story lines are split with `clean`, the article sentences are embedded
    with `model.encode`, and `generate_summary` selects as many sentences as
    the reference summary has (capped at the number of article sentences).
    The selected sentences are compared with the reference via `scorer.score`.

    Parameters
    ----------
    l : list of str, the lines of one story.

    Returns
    -------
    (rouge1, rouge2, rougeL) : the three F-measures reported by the scorer.
    """
    text,summary = clean(l)
    # A story may have fewer article sentences than reference sentences (or
    # none at all); never request more sentences than are available.
    k = min(len(summary), len(text))
    if k > 0:
        text_emb = model.encode(text)
        gen_sum = [text[idx] for idx in generate_summary(text_emb, k)]
    else:
        # Nothing to select: score an empty generated summary instead of
        # failing inside the embedding/selection step.
        gen_sum = []
    # Reference summary first, generated summary second.
    scores = scorer.score(" ".join(summary)," ".join(gen_sum))
    return scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure

In [13]:
# Per-story ROUGE F-measures collected over the test split.
r1 = []
r2 = []
rl = []
for i, x in enumerate(url_hashes_test):
    if i>0 and i%100 == 0:
        print(i,"Done")
    # Look for the story in the "cnn" directory first, otherwise use "dm".
    story_path = os.path.join("CNN-DM","raw","cnn","stories",x+".story")
    if not os.path.exists(story_path):
        story_path = os.path.join("CNN-DM","raw","dm","stories",x+".story")
    # Use a context manager so the file handle is closed for every story
    # (the loop opens one file per iteration), and decode explicitly as UTF-8
    # rather than with the platform default encoding.
    with open(story_path, encoding="utf-8") as story_file:
        l = story_file.readlines()
    r1_val,r2_val,rl_val = uml_summary(l)
    r1.append(r1_val)
    r2.append(r2_val)
    rl.append(rl_val)
# Report the mean of each metric as a percentage rounded to two decimals.
print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))

Rouge 1 :  30.36
Rouge 2 :  11.64
Rouge L :  22.03
