In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
import struct
import hashlib
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tokenization
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.cluster import KMeans
import pickle
from sknetwork.ranking import PageRank

In [2]:
## Generate list of training/validation and test files

def hashhex(s):
    """Return the SHA-1 digest of ``s`` as a hexadecimal string.

    Parameters
    ----------
    s : bytes or str
        Data to hash. ``str`` input is encoded as UTF-8 first, because
        hashlib only accepts bytes-like objects; bytes are hashed as given.

    Returns
    -------
    str
        The 40-character lowercase hexadecimal digest.
    """
    if isinstance(s, str):
        # Previously a str argument raised TypeError inside h.update().
        s = s.encode('utf-8')
    return hashlib.sha1(s).hexdigest()

def get_url_hashes(url_list):
    """Hash every entry of ``url_list`` with ``hashhex``, preserving order.

    Returns a new list with one hex digest per input URL.
    """
    digests = []
    for url in url_list:
        digests.append(hashhex(url))
    return digests

def read_text_file(text_file):
    """Read a text file and return its lines as UTF-8 encoded bytes.

    Each line has surrounding whitespace (including the newline) stripped
    before it is encoded. Blank lines are kept and come back as ``b''``.

    Parameters
    ----------
    text_file : str or path-like
        Path of the file to read.

    Returns
    -------
    list of bytes
        One entry per line of the file, in file order.
    """
    # Decode explicitly as UTF-8 so that the bytes produced by .encode('utf-8')
    # below do not depend on the platform's default text encoding.
    with open(text_file, "r", encoding="utf-8") as f:
        return [line.strip().encode('utf-8') for line in f]

# Locations of the URL lists for the two splits used in this notebook.
url_file_train, url_file_val = 'CNN-DM/all_train.txt', 'CNN-DM/all_val.txt'

# Training split: read the URLs, hash them, report how many there are.
url_list_train = read_text_file(url_file_train)
url_hashes_train = get_url_hashes(url_list_train)
print(len(url_hashes_train))

# Validation split: same steps.
url_list_val = read_text_file(url_file_val)
url_hashes_val = get_url_hashes(url_list_val)
print(len(url_hashes_val))

# url_file_test = 'CNN-DM/all_test.txt'
# url_list_test = read_text_file(url_file_test)
# url_hashes_test = get_url_hashes(url_list_test)
# print(len(url_hashes_test))

287227
13368


In [3]:
## Line by line preprocessing (no tokenization)
def preprocess(x):
    """Return ``x`` with leading and trailing whitespace removed.

    This is the only per-line normalisation applied; no tokenization is done.
    """
    stripped = x.strip()
    return stripped

In [4]:
## Clean text, and split summary
def clean(x):
    """Split the raw lines of a story into article lines and summary lines.

    Lines before the first line containing ``@highlight`` go to the article
    text; every later line goes to the summary. The ``@highlight`` marker
    lines themselves are dropped, and each kept line is passed through
    ``preprocess``.

    Blank lines are skipped. A line counts as blank when it is empty after
    stripping whitespace, so whitespace-only lines and ``'\\r\\n'`` are
    skipped too. The earlier ``line == '\\n'`` test let those through as
    empty strings.

    Parameters
    ----------
    x : iterable of str
        Lines of one story file, e.g. the result of ``readlines()``.

    Returns
    -------
    (list of str, list of str)
        ``(text, summary)``.
    """
    text = []
    summary = []
    in_summary = False
    for line in x:
        if not line.strip():
            continue
        if '@highlight' in line:
            # Everything after the first marker belongs to the summary.
            in_summary = True
            continue
        target = summary if in_summary else text
        target.append(preprocess(line))
    return text, summary

In [5]:
## Sentence-embedding model shared by the cells below. Its encode() method is called
## with a list of sentences (see uml_summary) and is expected to return one embedding
## per sentence, all of the same dimension.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
## ROUGE scorer configured for rouge1, rouge2 and rougeL with stemming enabled; used by uml_summary.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeL'], use_stemmer=True)

In [32]:
## V = list of embeddings. k = target size of summary
## Returns a list of sentence indices

def generate_summary(V, k):
    """Select ``k`` sentence indices with a greedy farthest-point (k-centers) pass.

    Sentence 0 is always the first center. Each following center is the
    remaining sentence whose Euclidean distance to its nearest already-chosen
    center is largest (ties go to the lowest index).

    NOTE: a later cell in this notebook defines another ``generate_summary``
    (the PageRank variant); whichever cell was executed last is the one that
    ``uml_summary`` ends up calling.

    Parameters
    ----------
    V : sequence of numpy arrays
        One embedding per sentence; entries must support ``V[i] - V[j]``.
    k : int
        Target number of sentences.

    Returns
    -------
    list of int
        All indices in order when ``k >= len(V)``; an empty list when
        ``k <= 0`` (previously this case ran until ``max()`` failed on an
        empty dict); otherwise the chosen centers in selection order.
    """
    n = len(V)
    if k >= n:
        return list(range(n))
    if k <= 0:
        return []

    centers = [0]
    # nearest[c] is the distance from sentence c to its closest chosen center.
    # Keeping a running minimum avoids recomputing the distance to every
    # center on every round while producing exactly the same values.
    nearest = {
        cty: min(float("inf"), np.linalg.norm(V[cty] - V[0]))
        for cty in range(1, n)
    }
    while len(centers) < k:
        new_center = max(nearest, key=nearest.get)
        centers.append(new_center)
        del nearest[new_center]
        for cty in nearest:
            nearest[cty] = min(nearest[cty], np.linalg.norm(V[cty] - V[new_center]))
    return centers

In [36]:
## Pagerank version
def sim(a, b):
    """Cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / sqrt(dot(a, a) * dot(b, b))``.

    Returns 0.0 when either vector has zero norm. The bare division
    produced NaN in that case, and a single NaN would contaminate the whole
    similarity matrix built from these values.
    """
    denom = np.sqrt(np.dot(a, a) * np.dot(b, b))
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

def generate_summary(V,k):
    """Select the ``k`` highest-ranked sentences using PageRank over cosine similarity.

    Builds a symmetric ``n x n`` matrix of pairwise ``sim`` values (diagonal
    included), ranks the sentences with ``PageRank().fit_transform`` and keeps
    the ``k`` best-scoring indices, returned in ascending index order.

    NOTE: this definition replaces the k-centers ``generate_summary`` defined
    in an earlier cell once this cell has been executed.

    Parameters
    ----------
    V : numpy.ndarray
        Sentence embeddings; ``V.shape[0]`` is the number of sentences.
    k : int
        Target number of sentences.

    Returns
    -------
    list of int or numpy.ndarray
        ``list(range(len(V)))`` when ``k >= len(V)``; ``[]`` when ``k <= 0``;
        otherwise a sorted array of ``k`` indices.
    """
    if k >= len(V):
        return list(range(len(V)))
    if k <= 0:
        # Without this guard np.argpartition(scores, -0)[-0:] selects *every*
        # index, so asking for zero sentences returned the whole article.
        return []

    n = V.shape[0]
    adj = np.zeros((n, n))
    for i in range(n):
        adj[i, i] = sim(V[i], V[i])
        for j in range(i + 1, n):
            # Similarity is symmetric, so fill both triangles from one call.
            adj[i, j] = adj[j, i] = sim(V[i], V[j])

    # NOTE(review): cosine similarities can be negative; confirm that PageRank
    # treats negative edge weights the way this analysis intends.
    scores = PageRank().fit_transform(adj)
    top_k = np.argpartition(scores, -k)[-k:]
    return np.sort(top_k)

In [27]:
## Return R1, R2 and RL score for a text (using the generate_summary function)

def uml_summary(l):
    """Score an extractive summary of one story against its highlights.

    ``l`` holds the raw lines of a story file. The article sentences are
    embedded with ``model``, ``generate_summary`` picks as many sentences as
    there are highlight lines, and the picked sentences (joined by spaces) are
    scored against the joined highlights with ``scorer``.

    Returns the F-measures as a tuple ``(rouge1, rouge2, rougeL)``.
    """
    article_sentences, reference_lines = clean(l)
    embeddings = model.encode(article_sentences)
    chosen = generate_summary(embeddings, len(reference_lines))
    reference = " ".join(reference_lines)
    candidate = " ".join(article_sentences[idx] for idx in chosen)
    rouge = scorer.score(reference, candidate)
    return tuple(rouge[metric].fmeasure for metric in ("rouge1", "rouge2", "rougeL"))

In [10]:
%%time
# Don't run this casually: it scores every story of the validation split
# (13368 URLs according to the count printed earlier), one uml_summary call per story.
### all-MiniLM-L6-v2 with K-Centers
# NOTE: uml_summary uses whichever generate_summary cell was executed last.
r1 = []
r2 = []
rl = []
for i, x in enumerate(url_hashes_val):
    if i>0 and i%1000 == 0:
        print(i,"Done")
    # A story is stored under either the cnn or the dailymail directory; try cnn first.
    story_path = os.path.join("CNN-DM","raw","cnn","stories",x+".story")
    if not os.path.exists(story_path):
        story_path = os.path.join("CNN-DM","raw","dailymail","stories",x+".story")
    # The context manager closes each file promptly instead of leaking one
    # handle per story; UTF-8 is pinned so decoding does not depend on the platform.
    with open(story_path, encoding="utf-8") as story_file:
        l = story_file.readlines()
    r1_val,r2_val,rl_val = uml_summary(l)
    r1.append(r1_val)
    r2.append(r2_val)
    rl.append(rl_val)

# (Uncomment to test for 1 line)
#     break
print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))

1000 Done
2000 Done
3000 Done
4000 Done
5000 Done
6000 Done
7000 Done
8000 Done
9000 Done
10000 Done
11000 Done
12000 Done
13000 Done
Rouge 1 :  31.15
Rouge 2 :  9.71
Rouge L :  18.79


In [37]:
%%time
# Don't run this casually: it scores every story of the validation split and the
# recorded run below took about 2h 20min of wall time.
### all-MiniLM-L6-v2 with PageRank
# NOTE: uml_summary uses whichever generate_summary cell was executed last.
r1 = []
r2 = []
rl = []
for i, x in enumerate(url_hashes_val):
    if i>0 and i%1000 == 0:
        print(i,"Done")
    # A story is stored under either the cnn or the dailymail directory; try cnn first.
    story_path = os.path.join("CNN-DM","raw","cnn","stories",x+".story")
    if not os.path.exists(story_path):
        story_path = os.path.join("CNN-DM","raw","dailymail","stories",x+".story")
    # The context manager closes each file promptly instead of leaking one
    # handle per story; UTF-8 is pinned so decoding does not depend on the platform.
    with open(story_path, encoding="utf-8") as story_file:
        l = story_file.readlines()
    r1_val,r2_val,rl_val = uml_summary(l)
    r1.append(r1_val)
    r2.append(r2_val)
    rl.append(rl_val)

# (Uncomment to test for 1 line)
#     break
print("Rouge 1 : ",np.round(np.mean(np.asarray(r1))*100,2))
print("Rouge 2 : ",np.round(np.mean(np.asarray(r2))*100,2))
print("Rouge L : ",np.round(np.mean(np.asarray(rl))*100,2))

1000 Done
2000 Done
3000 Done
4000 Done
5000 Done
6000 Done
7000 Done
8000 Done
9000 Done
10000 Done
11000 Done
12000 Done
13000 Done
Rouge 1 :  33.33
Rouge 2 :  12.82
Rouge L :  21.0
CPU times: user 2h 10min 37s, sys: 10min 12s, total: 2h 20min 50s
Wall time: 2h 20min 38s


In [39]:
# Show the raw text of the story left in `l` by the final iteration of the evaluation loop.
print("".join(l))

A teacher and wrestling coach has been charged after allegedly having sex with a high school wrestler. 

Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student amid reports that she had sex with him on a school bus in January. 

The teacher, from Urich, Missouri who is believed to be married, was indicted by grand jury on March 20. She was freed on $7,500 bond, KCTV5 reported. 

Scroll down for video 

Teacher and wrestling coach Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student

According to members of the school community who spoke to the local TV station, the teenage boy 'had been bragging' about what happened with the teacher. 

The boy's father reported the incident to police on February 2. 

School superintendent Tim Gallagher told Daily Mail Online that Mrs Baker was placed on paid administrative leave when the allegations were first reported.  

She could face up

In [42]:
# Split the story held in `l` into article lines and highlight lines, then
# print both parts with a "___" separator between them.
text,summary = clean(l)
print(*text, sep="\n")
print("___")
print(*summary, sep="\n")

A teacher and wrestling coach has been charged after allegedly having sex with a high school wrestler.
Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student amid reports that she had sex with him on a school bus in January.
The teacher, from Urich, Missouri who is believed to be married, was indicted by grand jury on March 20. She was freed on $7,500 bond, KCTV5 reported.
Scroll down for video
Teacher and wrestling coach Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student
According to members of the school community who spoke to the local TV station, the teenage boy 'had been bragging' about what happened with the teacher.
The boy's father reported the incident to police on February 2.
School superintendent Tim Gallagher told Daily Mail Online that Mrs Baker was placed on paid administrative leave when the allegations were first reported.
She could face up to four years i

In [43]:
# Display the cleaned article lines as a list; the repr exposes characters such as \xa0 that print() renders invisibly.
text

['A teacher and wrestling coach has been charged after allegedly having sex with a high school wrestler.',
 'Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student amid reports that she had sex with him on a school bus in January.',
 'The teacher, from Urich, Missouri who is believed to be married, was indicted by grand jury on March 20. She was freed on $7,500 bond, KCTV5\xa0reported.',
 'Scroll down for video',
 'Teacher and wrestling coach Megan Blair Baker, 25, has been accused of sexual contact with a 17-year-old Sherwood Cass High School student',
 "According to members of the school community who spoke to the local TV station, the teenage boy 'had been bragging' about what happened with the teacher.",
 "The boy's father reported the incident to police on February 2.",
 'School superintendent Tim Gallagher told Daily Mail Online that Mrs Baker was placed on paid administrative leave when the allegations were first reported.'