In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import struct
import hashlib
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tokenization
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans
import pickle

In [9]:
## Generate list of training/validation and test files

def hashhex(s):
    """Return the hexadecimal SHA-1 digest of ``s``.

    Args:
        s: Data to hash. Bytes-like input is hashed as-is; a ``str`` is
            encoded as UTF-8 first so callers do not have to pre-encode.

    Returns:
        The 40-character lowercase hex digest as a ``str``.
    """
    if isinstance(s, str):
        # hashlib rejects text; UTF-8 matches the encoding read_text_file
        # applies to the URLs before they are hashed.
        s = s.encode("utf-8")
    return hashlib.sha1(s).hexdigest()

def get_url_hashes(url_list):
    """Hash every URL in ``url_list`` with ``hashhex``.

    Args:
        url_list: Iterable of URLs in the form ``hashhex`` accepts.

    Returns:
        A list of hex digests, one per input URL, in input order.
    """
    hashes = []
    for url in url_list:
        hashes.append(hashhex(url))
    return hashes

def read_text_file(text_file):
    """Read a text file and return its lines as stripped UTF-8 byte strings.

    Args:
        text_file: Path of the file to read.

    Returns:
        A list with one ``bytes`` entry per line of the file, with leading
        and trailing whitespace removed. Blank lines are kept as ``b""``.
    """
    # Decode explicitly as UTF-8: the lines are re-encoded as UTF-8 below, so
    # relying on the platform's default encoding could change the bytes (and
    # therefore any hash computed from them) from one machine to another.
    with open(text_file, "r", encoding="utf-8") as f:
        return [line.strip().encode("utf-8") for line in f]

def _load_split_hashes(url_file):
    """Read one split's URL file, hash every URL and print how many there are.

    Returns:
        A ``(url_list, url_hashes)`` tuple for the split.
    """
    url_list = read_text_file(url_file)
    url_hashes = get_url_hashes(url_list)
    print(len(url_hashes))
    return url_list, url_hashes

# Same read -> hash -> print sequence for each of the three splits.
url_file_train = 'CNN-DM/all_train.txt'
url_list_train, url_hashes_train = _load_split_hashes(url_file_train)

url_file_val = 'CNN-DM/all_val.txt'
url_list_val, url_hashes_val = _load_split_hashes(url_file_val)

url_file_test = 'CNN-DM/all_test.txt'
url_list_test, url_hashes_test = _load_split_hashes(url_file_test)

287227
13368
11490


In [10]:
## Line by line preprocessing (no tokenization)
def preprocess(x):
    """Normalise a single line without tokenizing it.

    Args:
        x: The raw line.

    Returns:
        ``x`` with leading and trailing whitespace (including the newline)
        removed.
    """
    stripped = x.strip()
    return stripped

In [4]:
## Clean text, and split summary
def clean(x):
    """Split the lines of a story into article text and summary lines.

    Lines before the first line containing ``'@highlight'`` are collected as
    article text; lines after it are collected as summary lines. Lines that
    contain the ``'@highlight'`` marker themselves are dropped, as are lines
    that are empty once surrounding whitespace is removed.

    Args:
        x: Iterable of raw lines from a story file.

    Returns:
        A ``(text, summary)`` tuple of lists of preprocessed lines.
    """
    text = []
    summary = []
    in_summary = False
    for line in x:
        # Skip every blank line, not only an exact '\n': whitespace-only or
        # '\r\n' lines would otherwise be kept as empty "sentences".
        if not line.strip():
            continue
        if '@highlight' in line:
            in_summary = True
            continue
        target = summary if in_summary else text
        target.append(preprocess(line))
    return text,summary

In [5]:
## Sentence-embedding model, loaded by name ('all-MiniLM-L6-v2').
## It is used below through model.encode(...), which is called with a list of sentences
## and returns one embedding per sentence (all of the same dimension).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
## ROUGE scorer configured for rouge1, rouge2 and rougeL, with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)

In [7]:
## V = list of embeddings. k = target size of summary
## Returns a list of sentence indices

def generate_summary(V, k):
    """Greedily pick ``k`` mutually distant sentences from their embeddings.

    Index 0 is always chosen first. Each following pick is the sentence whose
    Euclidean distance to its nearest already-chosen sentence is largest;
    ties go to the lowest index.

    Args:
        V: Indexable sequence of embedding vectors (``V[i] - V[j]`` must be
            something ``np.linalg.norm`` accepts).
        k: Target number of sentences in the summary.

    Returns:
        A list of sentence indices in the order they were picked. All indices
        are returned (in document order) when ``k >= len(V)``; an empty list
        is returned when ``k <= 0`` or ``V`` is empty.
    """
    n = len(V)
    # A non-positive k used to fall into the selection loop, which counted k
    # down past zero until no candidates were left and max() raised.
    if k <= 0 or n == 0:
        return []
    if k >= n:
        return list(range(n))

    centers = [0]
    # nearest[i]: distance from candidate i to its closest chosen center.
    # Updating it against only the newest center avoids recomputing the
    # distance to every earlier center on each round.
    nearest = {i: np.linalg.norm(V[i] - V[0]) for i in range(1, n)}
    while len(centers) < k:
        # dicts keep insertion order, so max() breaks ties on the lowest index.
        new_center = max(nearest, key=nearest.get)
        centers.append(new_center)
        del nearest[new_center]
        for i in nearest:
            dist = np.linalg.norm(V[i] - V[new_center])
            if dist < nearest[i]:
                nearest[i] = dist
    return centers

In [8]:
## Return R1, R2 and RL score for a text (using the generate_summary function)

def uml_summary(l):
    """Score the generated summary of one story against its reference summary.

    The story lines are split with ``clean``, the text sentences are embedded
    with ``model.encode``, and ``generate_summary`` picks as many sentences
    as the reference summary has lines. Both summaries are joined with
    spaces and compared with ``scorer``.

    Args:
        l: Raw lines of a story file.

    Returns:
        A ``(rouge1, rouge2, rougeL)`` tuple of F-measures.
    """
    sentences, highlights = clean(l)
    embeddings = model.encode(sentences)
    picked = generate_summary(embeddings, len(highlights))
    reference = " ".join(highlights)
    candidate = " ".join([sentences[idx] for idx in picked])
    rouge = scorer.score(reference, candidate)
    return tuple(rouge[metric].fmeasure for metric in ("rouge1", "rouge2", "rougeL"))

In [27]:
# Don't run this

def _read_story_lines(story_hash):
    """Return the lines of the story file named ``<story_hash>.story``.

    The CNN stories directory is checked first; when no file with that name
    exists there, the Daily Mail stories directory is used instead.
    """
    file_name = story_hash + ".story"
    story_path = os.path.join("CNN-DM","raw","cnn","stories",file_name)
    if not os.path.exists(story_path):
        story_path = os.path.join("CNN-DM","raw","dailymail","stories",file_name)
    # The context manager closes each handle right away; the loop below opens
    # one file per test story, so bare open(...).readlines() leaked them all.
    # NOTE(review): story files are assumed to be UTF-8 - confirm.
    with open(story_path, encoding="utf-8") as story_file:
        return story_file.readlines()

# Per-story ROUGE F-measures over the whole test split.
r1 = []
r2 = []
rl = []
for i, x in enumerate(url_hashes_test):
    # Progress marker every 100 stories.
    if i>0 and i%100 == 0:
        print(i,"Done")
    l = _read_story_lines(x)
    r1_val,r2_val,rl_val = uml_summary(l)
    r1.append(r1_val)
    r2.append(r2_val)
    rl.append(rl_val)
# Mean F-measure per metric, as a percentage rounded to two decimals.
print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))

100 Done
200 Done
300 Done
400 Done
500 Done
600 Done
700 Done
800 Done
900 Done
1000 Done
1100 Done
1200 Done
1300 Done
1400 Done
1500 Done
1600 Done
1700 Done
1800 Done
1900 Done
2000 Done
2100 Done
2200 Done
2300 Done
2400 Done
2500 Done
2600 Done
2700 Done
2800 Done
2900 Done
3000 Done
3100 Done
3200 Done
3300 Done
3400 Done
3500 Done
3600 Done
3700 Done
3800 Done
3900 Done
4000 Done
4100 Done
4200 Done
4300 Done
4400 Done
4500 Done
4600 Done
4700 Done
4800 Done
4900 Done
5000 Done
5100 Done
5200 Done
5300 Done
5400 Done
5500 Done
5600 Done
5700 Done
5800 Done
5900 Done
6000 Done
6100 Done
6200 Done
6300 Done
6400 Done
6500 Done
6600 Done
6700 Done
6800 Done
6900 Done
7000 Done
7100 Done
7200 Done
7300 Done
7400 Done
7500 Done
7600 Done
7700 Done
7800 Done
7900 Done
8000 Done
8100 Done
8200 Done
8300 Done
8400 Done
8500 Done
8600 Done
8700 Done
8800 Done
8900 Done
9000 Done
9100 Done
9200 Done
9300 Done
9400 Done
9500 Done
9600 Done
9700 Done
9800 Done
9900 Done
10000 Done
10100 Do

In [None]:
## K-means variant on a single story: cluster the sentence embeddings into as
## many clusters as the reference summary has lines, then keep the sentence
## closest to each cluster centre.
x = url_hashes_test[0]
# Close the file handle deterministically instead of leaving it to the GC.
# NOTE(review): story files are assumed to be UTF-8 - confirm.
with open(os.path.join("CNN-DM","raw","cnn","stories",x+".story"), encoding="utf-8") as story_file:
    l = story_file.readlines()
text,summary = clean(l)
text_embeddings = model.encode(text)
summary_embeddings = model.encode(summary)

# Fixed seed so the clustering (and the resulting summary) is reproducible
# across kernel restarts.
kmeans = KMeans(n_clusters=len(summary_embeddings), random_state=0).fit(text_embeddings)
min_dist = [float('inf')] * len(summary_embeddings)
min_node = [-1] * len(summary_embeddings)

# For every cluster, remember the sentence nearest to its centre.
for i in range(len(text_embeddings)):
    cls = kmeans.labels_[i]
    dist = np.linalg.norm(text_embeddings[i] - kmeans.cluster_centers_[cls])
    if dist < min_dist[cls]:
        min_node[cls] = i
        min_dist[cls] = dist
# A cluster with no assigned sentence keeps the -1 sentinel; skip it rather
# than letting text[-1] silently select the last sentence.
gen_summary = [text[idx] for idx in min_node if idx != -1]