In [1]:
import json
import numpy as np

from utils import read_lines

In [2]:
# Load the path configuration used by the data-loading cells below.
# The context manager closes the file handle as soon as the JSON is parsed
# instead of leaving it open until garbage collection.
with open('path_config.json') as config_file:
    PATH = json.load(config_file)

In [3]:
# XSum test split: documents and their targets, loaded with read_lines from
# utils (presumably one list entry per line of the file -- confirm).
document_path = PATH['xsum_fariseq'] + '/test.source'
test_source = read_lines(document_path)

target_path = PATH['xsum_fariseq'] + '/test.target'
test_target = read_lines(target_path)

print(len(test_source))
# Documents and targets are paired by position, so the counts must agree.
assert len(test_target) == len(test_source)

11301


In [4]:
# CNN/DM training split: documents and their targets, loaded with read_lines
# from utils (presumably one list entry per line of the file -- confirm).
# Note: document_path / target_path are rebound here, replacing the XSum paths.
document_path = PATH['cnndm_fariseq'] + '/train.source'
train_source = read_lines(document_path)

target_path = PATH['cnndm_fariseq'] + '/train.target'
train_target = read_lines(target_path)

print(len(train_source))
# Documents and targets are paired by position, so the counts must agree.
assert len(train_target) == len(train_source)

287227


#### Inspection

In [5]:
# Entity annotations for the two groups compared in this notebook.
# Each file is opened in a context manager so its handle is closed right
# after parsing rather than being leaked.
with open('right_entities.json') as right_file:
    right_entities = json.load(right_file)
with open('left_entities.json') as left_file:
    left_entities = json.load(left_file)

In [6]:
left_entities[1]

{'start': 74,
 'end': 79,
 'label': 1,
 'type': 'PERSON',
 'ent': 'David',
 'prior': 0.00992584228515625,
 'posterior': 0.94140625,
 'id': 1513}

In [7]:
test_source[922]

'The Times Educational Supplement (TES) says measures to guard against grade inflation were not appropriate for this year\'s English exams. Teaching unions have accused Ofqual of "regulatory failure" and say it is more evidence of flawed exam marking. But the regulator insists it applied its measures correctly. It comes as state and private school heads call for an independent inquiry into the problems. Last week Ofqual said it felt the way this year\'s English GCSE exams were graded was fair, despite the  grade boundaries being moved significantly part-way through the year. Teachers have complained that pupils achieving exactly the same marks would have received different grades depending on what time of year they sat the exam. And many who were predicted a grade C, obtained a grade D in their summer exams. Ofqual found that June\'s grade boundaries were correct, while January\'s were "too lenient". A key document sent to exam boards, which is on the Ofqual website, sets out how it ex

#### TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Test document for each "left" entity, looked up by the entity's 'id' field.
left_source = [test_source[entity['id']] for entity in left_entities]

In [10]:
# Test document for each "right" entity, looked up by the entity's 'id' field.
right_source = [test_source[entity['id']] for entity in right_entities]

In [11]:
print(len(left_source))

18


In [12]:
print(len(right_source))

18


In [13]:
# One combined corpus in a fixed order: left documents, right documents,
# then every training source. Later cells slice by this order.
concat_corpus = [*left_source, *right_source, *train_source]
print(len(concat_corpus))

287263


In [14]:
# Fit TF-IDF on the combined corpus so that all documents share one vocabulary.
# Despite its name, this matrix has a row for every entry of concat_corpus
# (left_source, right_source and train_source, in that order), not only for
# the training sources.
train_source_tfidf = TfidfVectorizer().fit_transform(concat_corpus)

In [15]:
train_source_tfidf.shape

(287263, 480785)

#### Calculate Similarity

In [16]:
# Split the TF-IDF rows back into the three groups, following the order in
# which concat_corpus was assembled (left, right, train).
left_end = len(left_source)
right_end = left_end + len(right_source)

left_vector = train_source_tfidf[:left_end, :]
right_vector = train_source_tfidf[left_end:right_end, :]
source_vector = train_source_tfidf[right_end:, :]

In [17]:
print(left_vector.shape)
print(right_vector.shape)
print(source_vector.shape)

(18, 480785)
(18, 480785)
(287227, 480785)


In [18]:
def get_max_similarity(source_vector, target_vector):
    """Return, for each target row, its best dot-product score over all source rows.

    The score is the plain dot product of the row vectors (this equals cosine
    similarity only if the rows are L2-normalised).

    Args:
        source_vector: 2-D sparse matrix, one row per source document.
        target_vector: 2-D sparse matrix, one row per target document, with
            the same number of columns as ``source_vector``.

    Returns:
        The column-wise maximum of ``source_vector @ target_vector.T``, as
        produced by that product's ``max(axis=0)``. For SciPy sparse matrix
        inputs this is a sparse ``(1, n_targets)`` matrix; call ``.toarray()``
        on it to get a dense array.

    Side effects:
        Prints the shape of the full similarity matrix.
    """
    # `@` is always the matrix product. `*` only means that for SciPy
    # `spmatrix` types; on sparse arrays and ndarrays it is element-wise.
    similarity = source_vector @ target_vector.T
    print(similarity.shape)
    # axis=0 reduces over the source rows, leaving one value per target.
    return similarity.max(axis=0)

In [19]:
# For every left / right document, the best score it reaches against any
# training source (source_vector holds the training rows of the TF-IDF matrix).
left_max_similarity = get_max_similarity(source_vector, left_vector)
right_max_similarity = get_max_similarity(source_vector, right_vector)

(287227, 18)
(287227, 18)


In [20]:
# Mean of the per-document maximum similarity for the left group; the result
# is sparse, so it is densified and its single row taken before averaging.
np.average(left_max_similarity.toarray()[0])

0.42916393642777273

In [21]:
# Mean of the per-document maximum similarity for the right group; the result
# is sparse, so it is densified and its single row taken before averaging.
np.average(right_max_similarity.toarray()[0])

0.3965929901832175

In [22]:
# Densify once, before the loop: the conversion does not depend on the
# threshold, so repeating it on every iteration was redundant work.
left_dense = left_vector.toarray()
right_dense = right_vector.toarray()

# NOTE(review): left_vector / right_vector are rows of the TF-IDF matrix, so
# the counts below are "entries of each document's TF-IDF row above the
# threshold", not "training documents whose similarity exceeds the threshold".
# If the latter was intended, threshold the similarity matrices instead -- confirm.
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5]:
    print('threshold: {}'.format(threshold))
    left_related = left_dense > threshold
    right_related = right_dense > threshold
    # For each side: first the per-document counts, then the overall total.
    print('- left: {}'.format(np.sum(left_related, axis=1)))
    print('- left: {}'.format(np.sum(left_related)))
    print('- right: {}'.format(np.sum(right_related, axis=1)))
    print('- right: {}'.format(np.sum(right_related)))
    print()

threshold: 0.1
- left: [31 17 26 14 12 29 17 22 29  7 29 29 24 12 28 12 19 17]
- left: 374
- right: [20 24  9 30 30 16 16 12 22 13 23 17 17 11 11 17 26 17]
- right: 331

threshold: 0.2
- left: [2 5 3 5 3 7 3 3 5 2 6 9 8 7 3 3 4 2]
- left: 80
- right: [4 2 2 4 4 4 4 2 3 3 8 5 5 3 3 5 4 2]
- right: 67

threshold: 0.3
- left: [0 0 1 1 2 0 1 2 1 2 1 1 2 4 1 2 2 1]
- left: 24
- right: [1 1 2 1 1 2 2 1 2 1 1 2 2 2 2 3 0 1]
- right: 27

threshold: 0.4
- left: [0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 2 1 1]
- left: 9
- right: [1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1]
- right: 12

threshold: 0.5
- left: [0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0]
- left: 5
- right: [0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
- right: 3



#### Entity Count

In [23]:
import torch

In [24]:
# Score of every training source (rows) against every left / right document
# (columns). `@` is used because it is always the matrix product, whereas `*`
# is element-wise for SciPy sparse arrays and NumPy ndarrays.
left_similarity = source_vector @ left_vector.T
right_similarity = source_vector @ right_vector.T

In [25]:
def count_entity(similarity_matrix, entities, k=3, documents=None, summaries=None):
    """Count entity mentions in the top-k highest-scoring training examples.

    For each column of ``similarity_matrix`` the ``k`` highest-scoring rows
    are selected. The documents at those rows are joined with single spaces
    and the occurrences of the column's entity string are counted with
    ``str.count`` (plain substring matching); the same is done for the
    summaries at those rows.

    Args:
        similarity_matrix: 2-D scores of shape (n_documents, n_entities).
            Either an object with a ``toarray()`` method (e.g. a SciPy sparse
            matrix) or anything ``torch.as_tensor`` accepts.
        entities: sequence of dicts; ``entities[i]['ent']`` is the string
            counted for column ``i``.
        k: number of top rows taken per column. Values larger than the number
            of rows are clamped to the number of rows.
        documents: texts indexed by row. Defaults to the notebook-level
            ``train_source``.
        summaries: texts indexed by row. Defaults to the notebook-level
            ``train_target``.

    Returns:
        ``(doc_counts, sum_counts)``: two lists with one integer per column,
        the mention counts in the joined documents and joined summaries.
    """
    # Resolve the defaults lazily so explicit arguments never touch the globals.
    if documents is None:
        documents = train_source
    if summaries is None:
        summaries = train_target

    if hasattr(similarity_matrix, 'toarray'):
        similarity_matrix = similarity_matrix.toarray()
    scores = torch.as_tensor(similarity_matrix)

    # torch.topk raises when k exceeds the size of the reduced dimension.
    k = min(k, scores.shape[0])
    # top_rows[rank][col] is the row index of the rank-th best match for
    # column col; .tolist() turns the tensor indices into plain ints.
    top_rows = torch.topk(scores, k, dim=0).indices.tolist()

    doc_counts, sum_counts = [], []
    for col in range(scores.shape[1]):
        rows = [ranked[col] for ranked in top_rows]
        mention = entities[col]['ent']
        joined_documents = ' '.join(documents[row] for row in rows)
        joined_summaries = ' '.join(summaries[row] for row in rows)
        doc_counts.append(joined_documents.count(mention))
        sum_counts.append(joined_summaries.count(mention))
    return doc_counts, sum_counts

In [26]:
# Mentions of each left entity in its 10 highest-scoring training documents
# and in the summaries paired with those documents.
left_doc_counts, left_sum_counts = count_entity(left_similarity, left_entities, k=10)

In [27]:
left_doc_counts

[0, 27, 0, 5, 17, 2, 0, 8, 4, 16, 1, 0, 5, 0, 7, 0, 1, 12]

In [33]:
# NOTE(review): this cell's execution count (In [33]) is out of sequence with
# the cells around it; re-run the notebook top to bottom to confirm the
# displayed values still match.
left_sum_counts

[0, 10, 0, 0, 3, 0, 0, 3, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3]

In [28]:
sum(left_doc_counts) / len(left_doc_counts)

5.833333333333333

In [29]:
sum(left_sum_counts) / len(left_sum_counts)

1.4444444444444444

In [30]:
# Mentions of each right entity in its 10 highest-scoring training documents
# and in the summaries paired with those documents.
right_doc_counts, right_sum_counts = count_entity(right_similarity, right_entities, k=10)

In [31]:
sum(right_doc_counts) / len(right_doc_counts)

8.555555555555555

In [32]:
sum(right_sum_counts) / len(right_sum_counts)

1.8333333333333333

In [34]:
left_entities[1]

{'start': 74,
 'end': 79,
 'label': 1,
 'type': 'PERSON',
 'ent': 'David',
 'prior': 0.00992584228515625,
 'posterior': 0.94140625,
 'id': 1513}