In [26]:
import json
import numpy as np

from utils import read_lines

In [27]:
# Load the machine-specific dataset locations. The context manager closes the
# config file deterministically instead of leaving the handle to the garbage
# collector.
with open('../path_config.json') as config_file:
    PATH = json.load(config_file)

In [28]:
# Test split: documents and reference summaries live in two files that are
# expected to be line-aligned (the length assertion below checks this).
document_path, target_path = (
    PATH['xsum_fariseq'] + suffix for suffix in ('/test.source', '/test.target')
)
test_source, test_target = read_lines(document_path), read_lines(target_path)
print(len(test_source))
assert len(test_source) == len(test_target)

11301


In [29]:
# Train split: documents and reference summaries live in two files that are
# expected to be line-aligned (the length assertion below checks this).
document_path, target_path = (
    PATH['xsum_fariseq'] + suffix for suffix in ('/train.source', '/train.target')
)
train_source, train_target = read_lines(document_path), read_lines(target_path)
print(len(train_source))
assert len(train_source) == len(train_target)

203575


#### Inspection

In [30]:
# Load the annotated entity records. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open('sigma_entities.json') as entities_file:
    sigma_entities = json.load(entities_file)

In [31]:
# Peek at one entity record to see which fields are available.
sigma_entities[1]

{'start': 60,
 'end': 65,
 'label': 1,
 'type': 'GPE',
 'ent': 'Dutch',
 'bart.large': 0.306396484375,
 'xsum_cmlm_bos': 0.6650390625,
 'cnndm_cmlm_cedar': 0.9306640625,
 'bart.large.xsum': 0.4501953125,
 'cnndm_cmlm_scratch_cedar_warmup_20000': 0.003833770751953125,
 'xsum_cmlm_scratch_cedar_warmup_10000': 0.0004146099090576172,
 'cnndm_cmlm_scratch_cedar_warmup_10000': 0.0019969940185546875,
 'xsum_cmlm_scratch_cedar_warmup_20000': 0.0012483596801757812,
 'bart.large.cnn': 0.001880645751953125,
 'id': 416,
 'prior': 0.9306640625,
 'posterior': 0.6650390625}

In [32]:
# Spot-check a single test document by position.
# NOTE(review): 922 is a hard-coded index -- confirm which entity record it is
# meant to accompany.
test_source[922]

'The Times Educational Supplement (TES) says measures to guard against grade inflation were not appropriate for this year\'s English exams. Teaching unions have accused Ofqual of "regulatory failure" and say it is more evidence of flawed exam marking. But the regulator insists it applied its measures correctly. It comes as state and private school heads call for an independent inquiry into the problems. Last week Ofqual said it felt the way this year\'s English GCSE exams were graded was fair, despite the  grade boundaries being moved significantly part-way through the year. Teachers have complained that pupils achieving exactly the same marks would have received different grades depending on what time of year they sat the exam. And many who were predicted a grade C, obtained a grade D in their summer exams. Ofqual found that June\'s grade boundaries were correct, while January\'s were "too lenient". A key document sent to exam boards, which is on the Ofqual website, sets out how it ex

#### TF-IDF

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Source document of every annotated entity; each record's 'id' is used as an
# index into the test split.
sigma_source = [test_source[entity['id']] for entity in sigma_entities]

In [35]:
# One source document was collected per entity record.
print(len(sigma_source))

13


In [36]:
# Put the entity documents ahead of the training documents so both can be
# vectorised together; the first len(sigma_source) entries are the entity
# documents.
concat_corpus = [*sigma_source, *train_source]
print(len(concat_corpus))

203588


In [37]:
# Fit a single TF-IDF vocabulary over the combined corpus so that entity
# documents and training documents share one feature space. Despite its name,
# the resulting matrix also contains the entity documents (as its first rows).
tfidf_vectorizer = TfidfVectorizer()
train_source_tfidf = tfidf_vectorizer.fit_transform(concat_corpus)

In [38]:
# One row per entry of concat_corpus (entity documents first, then training
# documents).
train_source_tfidf.shape

(203588, 265348)

#### Calculate Similarity

In [39]:
# Undo the concatenation: the leading rows belong to the entity documents, the
# remaining rows to the training documents.
num_sigma = len(sigma_source)
sigma_vector = train_source_tfidf[:num_sigma]
source_vector = train_source_tfidf[num_sigma:]

In [40]:
# Sanity check: row counts should equal len(sigma_source) and len(train_source).
print(sigma_vector.shape)
print(source_vector.shape)

(13, 265348)
(203575, 265348)


In [41]:
# Pairwise similarity between every training document (rows) and every entity
# document (columns). `@` is always a matrix product, whereas `*` is a matrix
# product only for legacy scipy.sparse matrix objects and would be
# element-wise for sparse arrays or dense ndarrays.
# NOTE(review): with TfidfVectorizer's default L2 row normalisation this dot
# product equals cosine similarity -- confirm the default `norm` is in effect.
similarity = source_vector @ sigma_vector.T

#### Document Retrieval

In [42]:
import torch

In [43]:
# Expect (number of training documents, number of entity documents).
print(similarity.shape)

(203575, 13)


In [44]:
def count_entity(similarity_matrix, entities, k=3, documents=None, summaries=None):
    """Count entity mentions in the top-k most similar documents and summaries.

    For each entity (one column of ``similarity_matrix``) the ``k`` rows with
    the highest similarity are looked up in ``documents`` / ``summaries``,
    joined with single spaces, and searched for the entity's surface string
    with ``str.count`` (case-sensitive, non-overlapping substring matches).

    Args:
        similarity_matrix: object exposing ``toarray()`` (e.g. a scipy.sparse
            matrix) that yields a 2-D array of shape
            (num_documents, num_entities).
        entities: sequence of dicts; ``entities[i]['ent']`` is the string
            counted for column ``i``.
        k: number of nearest documents to retrieve per entity. Values larger
            than the number of documents are clamped to that number.
        documents: sequence of document strings aligned with the matrix rows.
            Defaults to the notebook-level ``train_source``.
        summaries: sequence of summary strings aligned with ``documents``.
            Defaults to the notebook-level ``train_target``.

    Returns:
        ``(doc_counts, sum_counts)``: two lists holding one int per matrix
        column -- the number of occurrences of the entity string in the
        retrieved documents and in the retrieved summaries respectively.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if documents is None:
        documents = train_source
    if summaries is None:
        summaries = train_target

    scores = torch.as_tensor(similarity_matrix.toarray())
    # torch.topk raises when k exceeds the dimension size, so clamp it.
    num_neighbours = min(k, scores.shape[0])
    # Indices come back as [num_neighbours, num_entities]; transpose and convert
    # to plain Python ints so each row lists the retrieved rows of one entity.
    neighbour_ids = torch.topk(scores, num_neighbours, dim=0).indices.t().tolist()

    doc_counts, sum_counts = [], []
    for column, row_ids in enumerate(neighbour_ids):
        mention = entities[column]['ent']
        retrieved_docs = ' '.join(documents[row] for row in row_ids)
        retrieved_sums = ' '.join(summaries[row] for row in row_ids)
        doc_counts.append(retrieved_docs.count(mention))
        sum_counts.append(retrieved_sums.count(mention))
    return doc_counts, sum_counts

In [45]:
# For every entity, count its mentions among the 10 most similar training
# documents and among their reference summaries.
doc_counts, sum_counts = count_entity(similarity, sigma_entities, k=10)

In [46]:
# Per-entity mention counts in the retrieved documents (first 20 shown).
print(len(doc_counts))
print(doc_counts[:20])

13
[2, 1, 10, 0, 0, 4, 18, 0, 0, 0, 23, 2, 11]


In [47]:
# Per-entity mention counts in the retrieved summaries (first 20 shown).
print(len(sum_counts))
print(sum_counts[:20])

13
[9, 1, 9, 0, 4, 0, 6, 0, 0, 0, 3, 0, 5]


In [48]:
# Average number of entity mentions found in the retrieved documents.
sum(doc_counts) / len(doc_counts)

5.461538461538462

In [49]:
# Average number of entity mentions found in the retrieved summaries.
sum(sum_counts) / len(sum_counts)

2.8461538461538463

In [50]:
# Fraction of entities that occur at least once in their retrieved summaries.
sum(1 for count in sum_counts if count > 0) / len(sum_counts)

0.5384615384615384