In [30]:
import json
import numpy as np

from utils import read_lines

In [31]:
# PATH is indexed below by dataset key (e.g. 'xsum_fariseq') to obtain a
# directory prefix. The context manager closes the config file right after
# parsing instead of leaving the handle open for the rest of the session.
with open('../path_config.json') as config_file:
    PATH = json.load(config_file)

In [32]:
# Load the XSum test split (documents and their reference summaries).
# read_lines presumably returns one entry per line -- confirm in utils.
xsum_dir = PATH['xsum_fariseq']
document_path = xsum_dir + '/test.source'
target_path = xsum_dir + '/test.target'
test_source, test_target = read_lines(document_path), read_lines(target_path)
print(len(test_source))
# Documents and summaries are paired by position, so the lengths must agree.
assert len(test_target) == len(test_source)

11301


In [33]:
# Load the CNN/DM training split (documents and their reference summaries).
# Note: document_path / target_path are rebound here and no longer point at
# the XSum files loaded earlier.
cnndm_dir = PATH['cnndm_fariseq']
document_path = cnndm_dir + '/train.source'
target_path = cnndm_dir + '/train.target'
train_source, train_target = read_lines(document_path), read_lines(target_path)
print(len(train_source))
# Documents and summaries are paired by position, so the lengths must agree.
assert len(train_target) == len(train_source)

287227


#### Inspection

In [34]:
# Entity records to analyse; each record is a dict (see the sample displayed
# in the next cell). The context manager closes the file once it is parsed.
with open('sigma_entities.json') as entities_file:
    sigma_entities = json.load(entities_file)

In [35]:
# Peek at a single entity record to see which fields are available.
sigma_entities[1]

{'start': 60,
 'end': 65,
 'label': 1,
 'type': 'GPE',
 'ent': 'Dutch',
 'bart.large': 0.306396484375,
 'xsum_cmlm_bos': 0.6650390625,
 'cnndm_cmlm_cedar': 0.9306640625,
 'bart.large.xsum': 0.4501953125,
 'cnndm_cmlm_scratch_cedar_warmup_20000': 0.003833770751953125,
 'xsum_cmlm_scratch_cedar_warmup_10000': 0.0004146099090576172,
 'cnndm_cmlm_scratch_cedar_warmup_10000': 0.0019969940185546875,
 'xsum_cmlm_scratch_cedar_warmup_20000': 0.0012483596801757812,
 'bart.large.cnn': 0.001880645751953125,
 'id': 416,
 'prior': 0.9306640625,
 'posterior': 0.6650390625}

In [36]:
# Display one XSum test document for manual inspection.
# NOTE(review): 922 is a hard-coded index and differs from the 'id' (416) of
# the entity record displayed above -- confirm which entity it belongs to.
test_source[922]

'The Times Educational Supplement (TES) says measures to guard against grade inflation were not appropriate for this year\'s English exams. Teaching unions have accused Ofqual of "regulatory failure" and say it is more evidence of flawed exam marking. But the regulator insists it applied its measures correctly. It comes as state and private school heads call for an independent inquiry into the problems. Last week Ofqual said it felt the way this year\'s English GCSE exams were graded was fair, despite the  grade boundaries being moved significantly part-way through the year. Teachers have complained that pupils achieving exactly the same marks would have received different grades depending on what time of year they sat the exam. And many who were predicted a grade C, obtained a grade D in their summer exams. Ofqual found that June\'s grade boundaries were correct, while January\'s were "too lenient". A key document sent to exam boards, which is on the Ofqual website, sets out how it ex

#### TF-IDF

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# Each entity record stores the index of its XSum test document under 'id';
# collect those documents in the same order as the entities.
sigma_source = [test_source[entity['id']] for entity in sigma_entities]

In [39]:
# Sanity check: one document was collected per entity record.
print(len(sigma_source))

13


In [40]:
# Entity documents go first and the training documents follow, so the first
# len(sigma_source) entries of the corpus are exactly the entity documents.
concat_corpus = [*sigma_source, *train_source]
print(len(concat_corpus))

287240


In [41]:
# Fit a single TF-IDF vocabulary over the combined corpus so entity documents
# and training documents end up in the same feature space.
# Despite its name, the resulting matrix also contains the sigma_source rows
# (they come first, see how concat_corpus is built).
train_source_tfidf = TfidfVectorizer().fit_transform(concat_corpus)

In [42]:
# (number of documents in concat_corpus, vocabulary size)
train_source_tfidf.shape

(287240, 480781)

#### Calculate Similarity

In [43]:
# The corpus was assembled with the entity documents first, so the TF-IDF rows
# split at that boundary: entity rows on top, training rows below.
n_sigma = len(sigma_source)
sigma_vector = train_source_tfidf[:n_sigma, :]
source_vector = train_source_tfidf[n_sigma:, :]

In [44]:
# Both slices keep the full vocabulary dimension; only the row counts differ.
print(sigma_vector.shape)
print(source_vector.shape)

(13, 480781)
(287227, 480781)


In [45]:
def get_max_similarity(source_vector, target_vector):
    """Return, for every target row, its highest dot-product with any source row.

    Args:
        source_vector: 2-D matrix of shape (n_source, n_features); either a
            scipy sparse matrix or a dense array.
        target_vector: 2-D matrix of shape (n_target, n_features).

    Returns:
        The column-wise maximum of the (n_source, n_target) product matrix.
        For scipy sparse input this is a sparse (1, n_target) matrix; for a
        dense ndarray it is a 1-D array of length n_target.

    Side effects:
        Prints the shape of the intermediate product matrix.

    Note:
        The dot product is a cosine similarity only if the rows are
        L2-normalised (the default for TfidfVectorizer).
    """
    # `@` is always a matrix product. `*` is a matrix product only for the
    # legacy scipy spmatrix / np.matrix types and element-wise for everything
    # else, which would silently compute the wrong thing (or fail to broadcast).
    similarity = source_vector @ target_vector.T
    print(similarity.shape)
    return np.max(similarity, axis=0)

In [46]:
# Best-matching training document score for each entity document
# (training rows are the "source" side, entity rows the "target" side).
max_similarity = get_max_similarity(source_vector, sigma_vector)

(287227, 13)


In [47]:
# Mean, over the entity documents, of the best-match score found above.
max_similarity.toarray()[0].mean()

0.4141226566172572

In [48]:
# NOTE(review): the mask below is taken on the TF-IDF weights of the entity
# documents (sigma_vector), not on the similarity scores computed above, so it
# counts vocabulary terms whose weight exceeds the threshold -- confirm that
# this is the intended quantity.
sigma_dense = sigma_vector.toarray()  # loop-invariant, so densify only once
for threshold in (0.1, 0.2, 0.3, 0.4, 0.5):
    related = sigma_dense > threshold
    print('threshold: {}'.format(threshold))
    # First line: count per entity document; second line: overall count.
    print('- left: {}'.format(related.sum(axis=1)))
    print('- left: {}'.format(related.sum()))
    print()

threshold: 0.1
- left: [24 18 17 24 25 18 22 22 30 18 18 24  7]
- left: 267

threshold: 0.2
- left: [2 7 5 5 8 5 3 3 4 3 5 5 3]
- left: 58

threshold: 0.3
- left: [1 2 0 2 2 1 2 2 2 2 1 1 2]
- left: 20

threshold: 0.4
- left: [1 1 0 0 0 1 1 1 0 1 1 0 1]
- left: 8

threshold: 0.5
- left: [0 0 0 0 0 0 1 1 0 0 0 0 1]
- left: 3



#### Entity Count

In [49]:
import torch

In [50]:
# Pairwise dot products between every training document (rows) and every
# entity document (columns). `@` is used because it is a matrix product for
# all array types, whereas `*` is only a matrix product for the legacy scipy
# spmatrix classes and element-wise otherwise.
similarity = source_vector @ sigma_vector.T

In [51]:
# (number of training documents, number of entity documents)
similarity.shape

(287227, 13)

In [52]:
def count_entity(similarity_matrix, entities, k=3, sources=None, targets=None):
    """Count entity mentions in each entity's k most similar training examples.

    Args:
        similarity_matrix: matrix of shape (n_documents, n_entities) exposing
            `.toarray()` (e.g. a scipy sparse matrix). Column e holds the
            similarity of every training document to the document of entity e.
        entities: sequence of dicts with an 'ent' key (the entity string);
            entities[e] belongs to column e.
        k: number of top-scoring documents to retrieve per entity. Values
            larger than n_documents are clamped to n_documents.
        sources: training documents, indexed by matrix row. Defaults to the
            notebook-level `train_source`.
        targets: training summaries, indexed by matrix row. Defaults to the
            notebook-level `train_target`.

    Returns:
        (doc_counts, sum_counts): two lists with one int per matrix column,
        giving how often the entity string occurs in the retrieved documents
        and in the retrieved summaries respectively. Counting uses `str.count`,
        i.e. case-sensitive, non-overlapping substring matches.

    Raises:
        IndexError: if `entities` has fewer items than the matrix has columns.
    """
    if sources is None:
        sources = train_source
    if targets is None:
        targets = train_target

    scores = torch.tensor(similarity_matrix.toarray())
    # torch.topk raises when asked for more rows than exist, so clamp.
    top_k = min(k, scores.shape[0])
    # topk along dim 0 yields indices of shape (top_k, n_entities); transpose
    # and convert to nested lists of plain ints, one list of rows per entity.
    nearest_rows = torch.topk(scores, top_k, dim=0).indices.t().tolist()

    doc_counts, sum_counts = [], []
    for column, rows in enumerate(nearest_rows):
        surface = entities[column]['ent']
        retrieved_docs = ' '.join(sources[row] for row in rows)
        retrieved_sums = ' '.join(targets[row] for row in rows)
        doc_counts.append(retrieved_docs.count(surface))
        sum_counts.append(retrieved_sums.count(surface))
    return doc_counts, sum_counts

In [53]:
# Retrieve the 10 most similar training examples per entity (the function's
# default is k=3) and count the entity string in documents and summaries.
doc_counts, sum_counts = count_entity(similarity, sigma_entities, k=10)

In [54]:
# One count per entity: occurrences of the entity string in its retrieved
# training documents. The [:20] slice just caps the printed length.
print(len(doc_counts))
print(doc_counts[:20])

13
[30, 4, 37, 1, 4, 3, 24, 0, 0, 0, 31, 8, 24]


In [55]:
# One count per entity: occurrences of the entity string in its retrieved
# training summaries. The [:20] slice just caps the printed length.
print(len(sum_counts))
print(sum_counts[:20])

13
[9, 0, 8, 0, 0, 0, 7, 0, 0, 0, 1, 2, 5]


In [56]:
# Average number of entity mentions in the retrieved documents, per entity.
sum(doc_counts) / len(doc_counts)

12.76923076923077

In [57]:
# Average number of entity mentions in the retrieved summaries, per entity.
sum(sum_counts) / len(sum_counts)

2.4615384615384617

In [58]:
# Fraction of entities that occur at least once in their retrieved summaries.
sum(1 for count in sum_counts if count > 0) / len(sum_counts)

0.46153846153846156